Health and Socioeconomic Factors Across Canada¶

Data 604 Final Report¶

Group Members

Bobbi Boyce
Kennedy Gunderson
Maxwell Paterson
Noah Seminoff

Setup¶

In [1]:
# imports
import pandas as pd
import numpy as np
import datetime as dt

# sql stuff
import sqlalchemy as sq
import csv
from sqlalchemy import text
import mysql.connector
from mysql.connector import Error

# plotting
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import statsmodels
In [2]:
# connection details like password etc.
PASSWORD = "49dw2Wuyku6Df"
USERNAME = "student"
HOST = "localhost"
PORT = "3306"
DATABASENAME = "student"

# change these to your filepath
LABOURPATH = "Data/labour_force_data.csv"
CPIPATH = "Data/CPI_dataset.csv"
OPIATEPATH = "Data/opioid_data.csv"
DEATHSPATH = "Data/deaths.csv"
GEOJSONPATH = 'Data/canada-provinces.geojson'

#Creating engine
engine = sq.create_engine(f'mysql+mysqlconnector://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/{DATABASENAME}')
In [3]:
def SQL(myquerystring):
    """ shorthand for pandas ready query """
    return pd.read_sql_query(myquerystring, engine)
In [4]:
# Noah's function definitions.
def pdsql(myquery) -> pd.DataFrame:
    """
    run SQL query and returns a dataframe
    """
    try:
        return pd.read_sql_query(myquery, engine)
    except Exception as e:
        return repr(e)

def runsql(command) -> None:
    """
    run SQL command in general. this one does not return a dataframe
    """
    try:
        with engine.connect() as cxn:
            cxn.execute(sq.text(command))
    except Exception as e:
        return repr(e)
    finally:
        return None

def droptable(tablename, needconfirmation=True) -> None:
    """
    tells SQL to drop the table if it exists already. asks for confirmation
    """
    # ask for confirmation
    if needconfirmation:
        areyousure = input(f'To drop table "{tablename}", type YES: ')
    else: # if running this function with ask=False
        areyousure = "YES"

    # execute drop table command
    if areyousure == "YES":
        runsql(f"DROP TABLE IF EXISTS {tablename};")
        print(f'table "{tablename}" dropped (or doesnt exist)')
    else:
        print(f'Wrong input, table "{tablename}" not dropped.')

    return None

def uploadLABOURdatabychunk(
    chunksize : int = 10**6,
    filename : str = "Data/14100287.csv",
    tablename : str = "labour",
    details : bool = True) -> None:
    """
    Read in the csv and upload to sql table in chunks.
    """

    # if we are running this function, want to make sure the table doesnt have data already
    droptable(tablename, needconfirmation=False)

    # open csv file to pandas dataframe in chunks. parse dates properly
    with pd.read_csv(filename,
                     parse_dates=['REF_DATE'],
                     date_format='%Y-%m',
                     chunksize=chunksize,
                     low_memory=False
                    ) as reader:
        # with file open and chunked, go through each chunk
        for i, chunk in enumerate(reader):
            """
            drop columns with only null values, or all the same value, or duplicate info:
            -DGUID is the same info as GEO
            -UOM_ID is the same info as UOM
            -SCALAR_FACTOR is either thousands if UOM is persons,
                or units if UOM is percentage
            -SCALAR_ID is the same info as scalar factor
            -VECTOR is a unique way of describing the combination of
                region/age/sex/etc, already have this info elsewhere
            -SYMBOL and TERMINATED have no actual values
            -DECIMALS is 1 for every row
            """
            chunk.drop(["DGUID",
                        "UOM_ID",
                        "SCALAR_FACTOR",
                        "SCALAR_ID",
                        "VECTOR",
                        "SYMBOL",
                        "TERMINATED",
                        "DECIMALS"],
                       axis=1, inplace=True)

            # rename columns for SQL (no spaces, more descriptive, etc)
            chunk.rename(columns={"REF_DATE":"date",
                                  "GEO":"region",
                                  "Labour force characteristics":"labourcharacteristics",
                                  "Sex":"sex",
                                  "Age group":"agegroup",
                                  "Statistics":"statistic",
                                  "Data type":"datatype",
                                  "UOM":"unit",
                                  "COORDINATE":"coordinate",
                                  "VALUE":"value",
                                  "STATUS":"status"}, inplace=True)

            # start upload this chunk
            if details: print("Starting upload chunk:", i+1)

            # take chunk, append to sql table if it exists (or create, if it doesnt)
            chunk.to_sql(tablename,
                         engine,
                         index=False,
                         if_exists='append',
                         chunksize=25000,
                         method=None)

            # done this chunk
            if details: print("Finished upload chunk:", i+1)

    # when finished,
    if details: print(f"All {i+1} chunks uploaded to table '{tablename}'")
    return None

def upload_labour_pivot(tablenamein,
                        tablenameout,
                        whichsex = 'both sexes',
                        whichagegroup = '15 years and over',
                        whichstatistic = 'estimate',
                        whichdatatype = 'unadjusted'):

    """
    takes input table, pivots based on value of labourcharacteristics, reuploads to SQL
    """
    labourpivotquery = f"""
    SELECT
        CONCAT(SUBSTRING(date, 1, 7), ':', region) AS dateregion,
        date,
        region,
        MAX(CASE WHEN labourcharacteristics = 'Population' THEN value END) "Population_thousands",
        MAX(CASE WHEN labourcharacteristics = 'Labour force' THEN value END) "LabourForce_thousands",
        MAX(CASE WHEN labourcharacteristics = 'Employment' THEN value END) "Employment_thousands",
        MAX(CASE WHEN labourcharacteristics = 'Full-time employment' THEN value END) "Fulltime_thousands",
        MAX(CASE WHEN labourcharacteristics = 'Part-time employment' THEN value END) "Parttime_thousands",
        MAX(CASE WHEN labourcharacteristics = 'Unemployment' THEN value END) "Unemployment_thousands",
        MAX(CASE WHEN labourcharacteristics = 'Unemployment rate' THEN value END) "Unemployment_percent",
        MAX(CASE WHEN labourcharacteristics = 'Participation rate' THEN value END) "Participation_percent",
        MAX(CASE WHEN labourcharacteristics = 'Employment rate' THEN value END) "Employment_percent"
    FROM
        {tablenamein}
    WHERE (
            sex = '{whichsex}'
        AND agegroup = '{whichagegroup}'
        AND statistic = '{whichstatistic}'
        AND datatype = '{whichdatatype}')
    GROUP BY
        date,
        region
    ORDER BY
        date
    """

    # turn into df
    labourpivotdf = pdsql(labourpivotquery)

    # interpret as datetime format
    labourpivotdf['date'] = pd.to_datetime(labourpivotdf['date'])
    # new column quarter based on regionYYYYQQ
    labourpivotdf['quarter'] = labourpivotdf["region"] + labourpivotdf['date'].dt.to_period('Q').astype(str)

    # df to SQL table
    labourpivotdf.to_sql(tablenameout, engine, index=False, if_exists = "replace")

def multi_line_plot(
    df : pd.DataFrame,             # df for your data
    columntuples : list[tuple],    # list of tuples of column names [(xcol,ycol), ...]
    serieslabels : list[str],      # series labels ['series1', 'series2']
    xlabel : str = '',             # x axis label
    ylabel : str = '',             # y axis label
    title : str = '',              # title label
    figsize : tuple = (1100, 500), # (width, height) for figure (in pixels)
    titlesize : int = 30           # font size for title
) -> tuple:
    """
    Plot multiple columns on the same figure
    """
    # create figure object
    fig = go.Figure()

    # loop over tuples, add each to plot
    for i, coltuple in enumerate(columntuples):
        fig.add_trace(go.Scatter(
            x = df[coltuple[0]],
            y = df[coltuple[1]],
            name = serieslabels[i]
        ))

    # add labels
    fig.update_layout(
        title =title,
        xaxis_title = xlabel,
        yaxis_title = ylabel
    )

    # make it look nice
    fig.update_xaxes(ticks = 'outside')
    fig.update_yaxes(ticks = 'outside')

    fig.update_layout(title_x = 0.5, title = dict(font=dict(size=titlesize)), width=figsize[0], height=figsize[1])

    # show figure
    fig.show()
    return None

def custom_scatter(
    xdata : pd.DataFrame,
    ydata : pd.DataFrame,
    markersize : int = 7,
    colordata : list = None,
    cbartitle : str = '',
    colorscale : str = 'Viridis',
    titlelabel : str = '',
    xlabel : str = '',
    ylabel : str = '',
    titlesize : int = 30,
    plotwidth : int = 1100,
    plotheight : int = 500,
    xrange : list[int] = None,
    yrange : list[int] = None
) -> None:
    """
    custom plotly scatterplot function
    """
    # create figure object
    fig = go.Figure()

    # add data to figure objc
    fig.add_trace(go.Scatter(
        x = xdata,
        y = ydata,
        marker=dict(
            size=markersize,
            color=colordata,
            colorbar=dict(
                title=dict(text=cbartitle)
            ),
            colorscale=colorscale
        ),
        mode='markers'
    ))

    # add labels
    fig.update_layout(
        title =titlelabel,
        xaxis_title = xlabel,
        yaxis_title = ylabel,
        xaxis_range = xrange,
        yaxis_range = yrange
    )

    # make it look nice
    fig.update_xaxes(ticks = 'outside')
    fig.update_yaxes(ticks = 'outside')

    fig.update_layout(title_x = 0.5, title = dict(font=dict(size=titlesize)), width=plotwidth, height=plotheight)

    # show figure
    fig.show()
    return None

# choropleth plot function definition
def choroplot(
    dataframe : pd.DataFrame,                                        # your dataframe by province
    data_whichcolumn : str,                                          # column name you want to plot
    prov_filename = GEOJSONPATH,                                     # filename for geographic province boundaries
    prov_columns_keep = ['geo_point_2d','prov_name_en','geometry'],  # what columns in geo file to keep
    prov_regionnamecolumn = 'prov_name_en',                          # geo file 'province name' column
    data_regionnamecolumn = 'region',                                # your df 'province name' column
    figax : tuple = (None, None),                                    # pass in custom tuple of (fig, ax) objects for plotting onto
    plotsize = (10,8),                                               # figsize for matplotlib
    plottitlename = '',                                              # plot title text
    plotlegendname = '',                                             # plot legend text
    plotlegendorientation = 'vertical',                              # colorbar vertical or horizontal
    datacolormap = 'OrRd',                                           # colormap name
    vmin = None,                                                     # optional colormap minimum value
    vmax = None,                                                     # optional colormap maximal value
    missingcolor = 'lightgrey',                                      # NaN value color
    legendon = True,                                                 # Bool : legend on/off
    boundaryon = False,                                              # Province outlines on/off
    axisoff = True                                                   # plot axis' on/off (x and y axis are lat/long)
    ) -> None:                                                       # returns -> None
    """
    geo file from:
    Provinces and territories—Canada. (n.d.). Retrieved December 3, 2024, from https://data.opendatasoft.com/explore/dataset/georef-canada-province@public/information/?disjunctive.prov_name_en

    merges geo data and provided dataframe into one df, and then plot a choropleth of the specified dataframe column (uses matplotlib)

    """
    import geopandas as gpd

    # get geo bounds of provinces, throw out french names etc
    prov_data = gpd.read_file(prov_filename)
    prov_data = prov_data[prov_columns_keep]

    # merge provincial data and geographic data into one DF
    merged_geo = prov_data.merge(dataframe, how='left', left_on=prov_regionnamecolumn,right_on=data_regionnamecolumn).set_index(data_regionnamecolumn)

    # setup plot. If didn't provide a figure and axis, make one
    if figax == (None, None):
        fig, ax = plt.subplots(1,1, figsize=plotsize)
    else:
        # pull out figure and axis from figax input
        fig, ax = figax

    # province outline
    if boundaryon:
        merged_geo.boundary.plot(ax=ax, edgecolor="black")

    # df.plot(...parameters...)
    merged_geo.plot(
        column=data_whichcolumn, # which column to plot
        ax = ax, # axis object to plot onto
        missing_kwds={"color":missingcolor}, # missing values
        cmap = datacolormap, # colomap
        vmin = vmin,
        vmax = vmax,
        legend = legendon, # legend on or off
        legend_kwds = {'label':plotlegendname, 'orientation':plotlegendorientation}) # legend keywords

    ax.set_aspect(2) # aspect ratio
    if axisoff: ax.set_axis_off() # turn off x and y axis markers

    ax.set_title(plottitlename)

    return fig, ax

Introduction¶

We believed that this research topic was important to hopefully better understand the health and well-being of Canadian citizens and how it varies based on different economic factors. This topic seems incredibly relevant to our current time due to the COVID-19 pandemic and the ongoing rise in the price of living. These two things greatly altered the labour force statistics throughout Canada as well as the consumer price index for many different essential and non-essential items. We want to see if this fluctuation in CPI as well as labour fource metrics has any correlation to deaths of despair, such as drug overdoses and suicides. If we can find causation between one of the metrics we are going to look at and overdoses or suicides, government policies could target these areas to reduce the amount of unnecessary deaths that occur in Canada. Of course, this is a very convoluted area of research, and it is difficult to conclude there is causation between any of these metrics; there are numerous factors influencing the relationship between our data sets, and how they come to impact Canadians in reality.

In previous research, it has been discovered that suicide rates among a population may not increase when layoffs occur. However, one set of researchers has concluded that when mass layoff occurs in an area, there is an increase in suicides. This could be due to increased difficulties in paying for necessities, with the increasing cost of living, or the daunting task of finding new employment. They have also found that suicides increase among populations as they are unemployed for longer periods of time. This shows that we may not be able to find a direct link between the labour force characteristics and suicides as the suicides may lag behind the unemployment rate. However, this is an important metric to explore, as mental health support could be increased for unemployed individuals (Classen and Dunn).

There was a study of 536 participants in Canada looking into the correlation between drug usage and housing status. This research by the NIH found that there is a relationship between housing status and perceived drug usage. Specifically, they found that homeless youth especially were more likely to engage in risky and frequent drug use compared to youths that were not homeless. Importantly, this report is also careful in saying that there is a causation between the two and they state that this research does have its limitations (Cheng et al.). However, it does show us that there is a possible relationship between housing status and drug use.

Economic uncertainty and suicides have been found to be positively associated even when controlling for unemployment. Sotiris and Kawachi from Harvard have found that within the United States, an increase in the economic policy uncertainty index by one percent is associated with a 0.33-0.72% increase in suicides per month (Vandoros and Kawachi). Although this is not directly related to CPI, and instead focuses on the economic policy uncertainty index, CPI is a factor that contributes to the changes in the economic policy uncertainty index (Alex Nae and Mark Barnes).


The research questions that we have decided to look into are the following four questions.

1. Is there a correlation between shelter price and opioid-related deaths?

This guiding question will be working with the CPI dataset as well as the opioid and stimulant-related harms dataset.

2. Are there trends in the cost of healthcare and the price of goods and services in relation to causes of death?

This guiding question is going to be working with the consumer index and death count datasets.

3. Is there fluctuation in the price of goods as unemployment rates change, and does this differs between recreational goods such as alcohol versus necessities such as shelter?

This guiding question is going to use the consumer index dataset as well as the labour force characteristics dataset.

4. Is there correlation between unemployment rates, opioid-related deaths and deaths caused by suicide?

This guiding question relates to three datasets, including the labour force characteristics dataset, opioid and stimulant-related harms dataset, and death count dataset.

Individual Datasets¶

Opioid and Stimulant-Related Harms in Canada (Max)¶

Relevance

The first dataset that we chose was the opioid and stimulant-related harms in Canada. This data set is going to be used to give us insight into the opioid crisis in Canada. We chose this dataset because it seemed to be the most complete dataset on drug use in Canada. It was also grouped by region which was a metric we were hoping could be universal for all of our datasets so we could look at each province specifically. I think looking into drug use in Canada is a very important topic, as opioid usage has greatly increased due to the COVID-19 pandemic and continues to increase in Canada and other countries (Canada Health).

Findings

From the following analysis of the opiate data set, we have learned that opioid usage has increased significantly since 2019 and is increasing steadily. We see this due to one of the queries where we are looking at the most deaths per quarter in descending order, the top 15 most deadly quarters are all from 2020 or newer. This shows a worrying trend with opiate usage and specifically deaths in Canada.

We see that the provinces with the most overall deaths from opioid use during the timeframe that the dataset spans are Ontario, British Columbia and Alberta. These three tower over all of the other provinces. Quebec has a surprisingly low amount of overdoses compared to its population. Quebec is the third most populated province in the Country but has the 4th largest amount of opioid deaths.

We also got to see the range of amounts of deaths per quarter for each of the provinces. This helps us to see how large or small some quarters for certain provinces have been. For example, Ontario's worst month had 735 opioid-related deaths, whereas its best was 195. This gives us an idea of the increase in usage in each respective province.

Lastly, we looked at the difference between the total opioid-related deaths and the accidental opioid-related deaths as the difference between these two numbers represents the number of suicides by opioids, theoretically. Of course, this number may be flawed as after death it is most likely hard to determine whether the overdose was intentional or not, but it does give some insight into how these deaths occur. We see that out of Canada's 47,058 deaths during this time frame, there were 2,575 suspected suicides by opioids, which is around 5.4%.

Analysis and Cleaning Steps

I am going to start out by importing and cleaning the data that I am working with. This is a very important step as the data is not in the form that we want it to be in and also has some data that we are not going to need in the final project. This will be done mainly using pandas and then imported into the SQL database.

The first step in this process is to retrieve the data that we are going to be working with from the CSV file that was provided. We retrieved the data from the Government of Canada under the Statistics Canada open license and was most recently updated in September of 2024.

In [5]:
originialopiatedata = pd.read_csv(OPIATEPATH)


We are going to be working with this data looking at only the quarterly data, it also provides yearly data but we are not going to be needing it so we are going to filter it to only give us the quarterly data. We also want to look at the overall numbers and not the other ways that the data measures things so we are going to filter for only those as well. This will leave us with the correct time periods for the data as well as the correct values that we want.

In [6]:
opiatedata = originialopiatedata[(originialopiatedata['Time_Period'] == "By quarter") & (originialopiatedata['Specific_Measure'] == "Overall numbers")].reset_index(drop=True)


We are now going to drop some columns that we are not going to need, due to the fact that they are redundant or not useful to us for this project.

In [7]:
opiatedata.drop(columns=["Source", "Specific_Measure", "PRUID", "Time_Period", "Aggregator", "Disaggregator", "Unit", "Substance"], inplace = True)


Now we need to create a new column that lists both the region as well as the year and quarter that the data is from, this is going to be used as the unique identifier across the data sets and thus it is important that we add it now.

In [8]:
opiatedata["Region_Year_Quarter"] = opiatedata["Region"] + opiatedata["Year_Quarter"].str.replace(" ", "")


While I have all the values in the same column, I am going to take this opportunity to clean all of the entries in this column as we see some values which were supressed by the region or are n/a and thus are going to all be transformed into a numpy nan number to have them all consistent.

In [9]:
opiatedata["Value"] = opiatedata["Value"].replace({"Suppr.":np.nan, "n/a":np.nan})


Now, the most important part of cleaning our data is pivoting the data set so that all the metrics that were in the 'Type_Event" column become columns of their own and thus make it so that each 'Region_Year_Quarter' entry is a unique value. We are going to do this using the pandas pivot function which greatly helps us in completing this step.

In [10]:
opiatedata = opiatedata.pivot(index = ["Region", "Year_Quarter", "Region_Year_Quarter"], columns = "Type_Event", values = "Value").reset_index()

From the result above we see what our data now looks like. It it is much closer to the form that we want it in and we are almost done cleaning the data.

We also need to rename all of the columns so that there are not any spaces in them by replacing them with underscores.

In [11]:
opiatedata = opiatedata.rename(columns = {"Accidental apparent opioid toxicity deaths":"Accidental_opioid_deaths",
                              "Accidental apparent stimulant toxicity deaths":"Accidental_stimulant_deaths",
                              "Accidental opioid-related poisoning ED visits":"Accidental_opioid_poisoning_ED_visits",
                              "Accidental opioid-related poisoning hospitalizations":"Accidental_opioid_poisoning_hospitalizations",
                              "Accidental stimulant-related poisoning ED visits":"Accidental_stimulant_poisoning_ED_visits",
                              "Accidental stimulant-related poisoning hospitalizations":"Accidental_stimulant_poisoning_hospitalizations",
                              "EMS responses to suspected opioid-related overdoses":"EMS_responses_to_opioid_overdoses",
                              "Total apparent opioid toxicity deaths":"Total_opioid_toxicity_deaths",
                              "Total apparent stimulant toxicity deaths":"Total_stimulant_toxicity_deaths",
                              "Total opioid-related poisoning ED visits":"Total_opioid_poisoning_ED_visits",
                              "Total opioid-related poisoning hospitalizations":"Total_opioid_poisoning_hospitalizations",
                              "Total stimulant-related poisoning ED visits":"Total_stimulant_poisoning_ED_visits",
                              "Total stimulant-related poisoning hospitalizations":"Total_stimulant_poisoning_hospitalizations"})


Additionally we are going to make sure that all of the columns are of a certain type to ensure consistency among the data. This can be done by turning all the columns with numbers in them into floats. We are using floats because integers do not allow any NaN values, and I want to keep the NaN values but ensure the values that are actually numbers are floats.

In [12]:
opiatedata.Accidental_opioid_deaths = opiatedata.Accidental_opioid_deaths.astype('float')
opiatedata.Accidental_stimulant_deaths = opiatedata.Accidental_stimulant_deaths.astype('float')
opiatedata.Accidental_opioid_poisoning_ED_visits = opiatedata.Accidental_opioid_poisoning_ED_visits.astype('float')
opiatedata.Accidental_opioid_poisoning_hospitalizations = opiatedata.Accidental_opioid_poisoning_hospitalizations.astype('float')
opiatedata.Accidental_stimulant_poisoning_ED_visits = opiatedata.Accidental_stimulant_poisoning_ED_visits.astype('float')
opiatedata.Accidental_stimulant_poisoning_hospitalizations = opiatedata.Accidental_stimulant_poisoning_hospitalizations.astype('float')
opiatedata.EMS_responses_to_opioid_overdoses = opiatedata.EMS_responses_to_opioid_overdoses.astype('float')
opiatedata.Total_opioid_toxicity_deaths = opiatedata.Total_opioid_toxicity_deaths.astype('float')
opiatedata.Total_stimulant_toxicity_deaths = opiatedata.Total_stimulant_toxicity_deaths.astype('float')
opiatedata.Total_opioid_poisoning_ED_visits = opiatedata.Total_opioid_poisoning_ED_visits.astype('float')
opiatedata.Total_opioid_poisoning_hospitalizations = opiatedata.Total_opioid_poisoning_hospitalizations.astype('float')
opiatedata.Total_stimulant_poisoning_ED_visits = opiatedata.Total_stimulant_poisoning_ED_visits.astype('float')
opiatedata.Total_stimulant_poisoning_hospitalizations = opiatedata.Total_stimulant_poisoning_hospitalizations.astype('float')


The data cleaning process for this data should be complete, just to make sure that we have sucessfully created a unique row with the "Region_Year_Quarter" row we are going to count how many rows there are as well as count how many unique rows there are. If the numbers are the same then we know that each and every row has a unique identifier that we will be able to use in the project.

In [13]:
print(opiatedata["Region_Year_Quarter"].nunique())
print(opiatedata["Region_Year_Quarter"].count())
611
611

These outputs confirm that we have done the correct cleaning steps and have ended up with a data set that contains unique identifiers.

We can now move onto the SQL portion as the data is ready to be imported and queried. We are going to start by creating the connection, as well as the engine for the SQL server.

Opiate Dataset Analysis

To start we need to create the table in our database in order to work on it through SQL queries.

In [14]:
opiatedata.to_sql("Opiates", if_exists='replace', con=engine, method=None, index=False)
Out[14]:
611


The following SQL query is used to grab all of the columns from the opiates table for only the regions that are provinces. There is some overlap in the data where there is Northwest Territories data and also Yellowknife data, for example. This is going through and making sure that the only regions that come through are provinces. We have also excluded Canada in this query.

This is going to be helpful for the project because when we are looking at and comparing data, the odds are good that the other data set regions only contain the provinces and not the other odd regions that were included in this data set.

In [15]:
alldataprovinces = SQL("""
    select *
    from Opiates
    where region != 'Territories'
        and region != 'Whitehorse, Yukon'
        and region != 'Winnipeg, Manitoba'
        and region != 'Yellowknife, Northwest Territories'
        and region != 'Northern and rural Manitoba'
        and region != 'Canada'
    """)

alldataprovinces
Out[15]:
Region Year_Quarter Region_Year_Quarter Accidental_opioid_deaths Accidental_stimulant_deaths Accidental_opioid_poisoning_ED_visits Accidental_opioid_poisoning_hospitalizations Accidental_stimulant_poisoning_ED_visits Accidental_stimulant_poisoning_hospitalizations EMS_responses_to_opioid_overdoses Total_opioid_toxicity_deaths Total_stimulant_toxicity_deaths Total_opioid_poisoning_ED_visits Total_opioid_poisoning_hospitalizations Total_stimulant_poisoning_ED_visits Total_stimulant_poisoning_hospitalizations
0 Alberta 2016 Q1 Alberta2016Q1 118.0 NaN 504.0 135.0 291.0 39.0 NaN 137.0 NaN 757.0 229.0 433.0 80.0
1 Alberta 2016 Q2 Alberta2016Q2 130.0 NaN 649.0 140.0 392.0 54.0 NaN 140.0 NaN 918.0 239.0 564.0 101.0
2 Alberta 2016 Q3 Alberta2016Q3 144.0 NaN 700.0 156.0 383.0 51.0 NaN 159.0 NaN 948.0 239.0 551.0 97.0
3 Alberta 2016 Q4 Alberta2016Q4 155.0 NaN 723.0 127.0 348.0 47.0 NaN 166.0 NaN 1002.0 223.0 510.0 92.0
4 Alberta 2017 Q1 Alberta2017Q1 152.0 NaN 786.0 151.0 356.0 54.0 567.0 162.0 NaN 1027.0 235.0 520.0 100.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
424 Yukon 2023 Q1 Yukon2023Q1 3.0 2.0 10.0 NaN NaN NaN NaN 3.0 2.0 12.0 NaN NaN NaN
425 Yukon 2023 Q2 Yukon2023Q2 5.0 5.0 28.0 NaN 7.0 NaN NaN 5.0 5.0 30.0 NaN 8.0 NaN
426 Yukon 2023 Q3 Yukon2023Q3 2.0 2.0 9.0 NaN 5.0 NaN NaN 2.0 2.0 10.0 NaN 6.0 NaN
427 Yukon 2023 Q4 Yukon2023Q4 7.0 7.0 6.0 NaN NaN NaN NaN 7.0 7.0 9.0 NaN NaN NaN
428 Yukon 2024 Q1 Yukon2024Q1 4.0 2.0 10.0 NaN 7.0 NaN NaN 4.0 2.0 17.0 NaN 9.0 NaN

429 rows × 16 columns


The proceeding SQL query looks at the total amount of deaths from opioids grouped by quarter and then put into descending order, and limited so we only see the top five quarters. This is going to give us insight into when the most deaths from opioids occurred and can possibly help us in determining which time frames are important to see if there are any correlations between the prices of goods and an increase in deaths caused by opioids.

In [16]:
mostdeathsperquarter = SQL("""
    select year_quarter,
    cast(sum(Total_opioid_toxicity_deaths) as int) as Total_Deaths
    from Opiates
    where region != 'Territories'
        and region != 'Whitehorse, Yukon'
        and region != 'Winnipeg, Manitoba'
        and region != 'Yellowknife, Northwest Territories'
        and region != 'Northern and rural Manitoba'
        and region != 'Canada'
    group by year_quarter
    order by Total_Deaths desc
    limit 20""")

mostdeathsperquarter
Out[16]:
year_quarter Total_Deaths
0 2023 Q2 2176
1 2021 Q4 2171
2 2023 Q4 2137
3 2023 Q1 2075
4 2023 Q3 2071
5 2022 Q4 2060
6 2022 Q1 2036
7 2021 Q3 1951
8 2024 Q1 1906
9 2021 Q1 1883
10 2021 Q2 1874
11 2020 Q4 1870
12 2022 Q3 1848
13 2022 Q2 1832
14 2020 Q3 1788
15 2020 Q2 1686
16 2018 Q4 1128
17 2018 Q3 1100
18 2019 Q2 1060
19 2017 Q3 1057


In addition to looking at the quarters with the most deaths attributed to them, we are also going to use the following code to find the regions with the most deaths attributed to them. This helps us see how many opioid deaths occur per region and makes us aware of the regions that have the most deaths and may be the best regions to look for trends

In [17]:
totaldeathsperprovince = SQL("""
    select Region,
    cast(sum(Total_opioid_toxicity_deaths) as int) as Total_Opioid_Deaths
    from Opiates
    where region != 'Territories'
        and region != 'Whitehorse, Yukon'
        and region != 'Winnipeg, Manitoba'
        and region != 'Yellowknife, Northwest Territories'
        and region != 'Northern and rural Manitoba'
        and region != 'Canada'
    group by region
    order by Total_Opioid_Deaths desc
    """)

totaldeathsperprovince
Out[17]:
Region Total_Opioid_Deaths
0 Ontario 16300
1 British Columbia 14260
2 Alberta 9363
3 Quebec 2816
4 Saskatchewan 1637
5 Manitoba 1520
6 Nova Scotia 472
7 New Brunswick 405
8 Newfoundland and Labrador 157
9 Yukon 96
10 Northwest Territories 26
11 Prince Edward Island 6
12 Nunavut 0


The following code is used to look at each of the regions/provinces and record the minimum and the maximum amount of opioid deaths in one quarter. This will give us some insight into the range of numbers that we should expect as well as confirm that there are no obvious outliers in our data. For example, this makes sure that we do not have any super high numbers in the data that are obviously wrong. At the same time, it gives us a good overview of the data and shows us the common range for each region's opioid death count.

In [18]:
minandmaxperprovince = SQL("""
    select region,
    cast(max(Total_opioid_toxicity_deaths) as int) as Max_Deaths_Quarter,
    cast(min(Total_opioid_toxicity_deaths) as int) as Min_Deaths_Quarter
    from Opiates
    where region != 'Territories'
        and region != 'Whitehorse, Yukon'
        and region != 'Winnipeg, Manitoba'
        and region != 'Yellowknife, Northwest Territories'
        and region != 'Northern and rural Manitoba'
        and region != 'Canada'
    group by region
    order by Max_Deaths_Quarter desc, Min_Deaths_Quarter asc
    """)

minandmaxperprovince
Out[18]:
region Max_Deaths_Quarter Min_Deaths_Quarter
0 Ontario 735 195
1 British Columbia 681 153
2 Alberta 512 125
3 Quebec 173 42
4 Saskatchewan 113 17
5 Manitoba 87 10
6 Nova Scotia 30 5
7 New Brunswick 24 4
8 Newfoundland and Labrador 12 5
9 Yukon 10 0
10 Prince Edward Island 6 0
11 Northwest Territories 3 0
12 Nunavut 0 0


Lastly, we are going to take a look at the quarters and return all of the quarters which had more deaths than average caused by opioid toxicity. This may mark significant times and regions to take a closer look at as the deaths were higher than average. Of course in this query, we see lots of Ontario quarters. This is because of its high population which creates a much higher chance to be above the overall average. It would be a great idea to do this same process for each province to see the time frames inside a province that is higher than that province's average.

In [19]:
morethanaveragequarters = SQL("""
    select Region_Year_Quarter,
    cast(Total_opioid_toxicity_deaths as int) as Total_opioid_toxicity_deaths
    from Opiates
    where region != 'Territories'
        and region != 'Whitehorse, Yukon'
        and region != 'Winnipeg, Manitoba'
        and region != 'Yellowknife, Northwest Territories'
        and region != 'Northern and rural Manitoba'
        and region != 'Canada'
        and Total_opioid_toxicity_deaths > (select avg(Total_opioid_toxicity_deaths) from Opiates
                                            where region != 'Territories'
                                            and region != 'Whitehorse, Yukon'
                                            and region != 'Winnipeg, Manitoba'
                                            and region != 'Yellowknife, Northwest Territories'
                                            and region != 'Northern and rural Manitoba'
                                            and region != 'Canada')
    group by Total_opioid_toxicity_deaths desc
    """)

morethanaveragequarters
Out[19]:
Region_Year_Quarter Total_opioid_toxicity_deaths
0 Ontario2021Q1 735
1 Ontario2021Q2 733
2 Ontario2020Q4 725
3 Ontario2021Q4 709
4 Ontario2023Q2 691
... ... ...
93 Alberta2016Q1 137
94 Quebec2022Q2 136
95 Quebec2023Q3 127
96 Quebec2023Q2 126
97 Alberta2019Q4 125

98 rows × 2 columns


The following code is being used to calculate the number of suspected intentional opioid overdoses per region for the entire time frame. This could be a telling metric if we want to compare something like the price of mental health facilities to the number of suspected suicides by opioid overdose. It is possible that this metric may have a more significant correlation to changes in things such as prices of groceries and since these are the deaths which are suspected to be intentional, one may expect that there is a reason behind it.

In [20]:
intentionalopioiddeaths = SQL("""
    select concat(region,' has a total of ', sum(Total_opioid_toxicity_deaths) - sum(Accidental_opioid_deaths) ,' suspected intentional opioid overdoses') as 'Suspected intentional opioid overdoses'
    from Opiates
    where Total_opioid_toxicity_deaths > 0
    and Accidental_opioid_deaths > 0
    group by region
    """)

intentionalopioiddeaths
Out[20]:
Suspected intentional opioid overdoses
0 Alberta has a total of 181 suspected intention...
1 British Columbia has a total of 268 suspected ...
2 Canada has a total of 2575 suspected intention...
3 Manitoba has a total of 244 suspected intentio...
4 New Brunswick has a total of 52 suspected inte...
5 Newfoundland and Labrador has a total of 21 su...
6 Northwest Territories has a total of 1 suspect...
7 Nova Scotia has a total of 75 suspected intent...
8 Ontario has a total of 1146 suspected intentio...
9 Prince Edward Island has a total of 0 suspecte...
10 Quebec has a total of 464 suspected intentiona...
11 Saskatchewan has a total of 79 suspected inten...
12 Yukon has a total of 2 suspected intentional o...

Monthly Consumer Price Index (Kennedy)¶

Relevance

The consumer price index dataset contains information about the prices of goods and services from 1914-2024 and is updated monthly. This data set is free to use under the Statistics Canada open government License. We chose this data set becasue it aligned with our topic of interest: Health and socioeconomic factors across Canada. This data set has interesting implications for Canadians' lives, specifically in the affordability of everyday items and services. Determining if the cost of goods and services interacts with the other data sets we selected may provide insights into where government initiatives should be implemented. More specifically, the CPI data set is being analyzed in conjunction with the opioids, labour and provisional deaths data sets.

Analysis and Cleaning Steps

The CPI data set was not originally in a format that would've allowed for the implementation of joins. As a result, cleaning and pivoting from long-format tables to wide-format tables was necessary. The following code chunks display how cleaning was performed.

In [21]:
#reading in data, and changing column names
prices_df = pd.read_csv(CPIPATH)
prices_df.rename(columns={"Products and product groups":"product", 'TERMINATED': 'term'}, inplace=True)

Since the CPI data set is so large, the data needs to be added to a table in chunks. For this portion of the project, this was easily achieved by breaking up the data set into three data frames and adding them to the CPI_initial table in SQL. The table labelled 'CPI_initial' contains the data in its long form.

In [22]:
#splitting data into chunks to add to an initial table
prices_df_first = prices_df.iloc[1:46864]
prices_df_mid = prices_df.iloc[46864:93725]
prices_df_last = prices_df.iloc[93725:140593]

# adding CPI to CPI initial
prices_df_first.to_sql('CPI_initial', engine, index=False,if_exists='replace')
prices_df_mid.to_sql('CPI_initial', engine, index=False,if_exists='append')
prices_df_last.to_sql('CPI_initial', engine, index=False,if_exists='append')
Out[22]:
46867

Following this, I dropped any columns that were not necessary for the initial data exploration. These columns were the STATUS, SYMBOL, and term, which all contained null values.

In [23]:
#Dropping unnecessary columns
with engine.connect() as connection:
    connection.execute(text("ALTER TABLE CPI_initial DROP COLUMN STATUS;"))

with engine.connect() as connection:
    connection.execute(text("ALTER TABLE CPI_initial DROP COLUMN SYMBOL;"))

with engine.connect() as connection:
    connection.execute(text("ALTER TABLE CPI_initial DROP COLUMN term;"))

Below is the format of the CPI_initial table. The main columns we will want to use for the CPI analysis are date (REF_DATE), province (GEO), product (product), and value (VALUE).

In [24]:
explore_data = pd.read_sql_query("SELECT * FROM CPI_initial LIMIT 10", engine)
explore_data
Out[24]:
REF_DATE GEO DGUID product UOM UOM_ID SCALAR_FACTOR SCALAR_ID VECTOR COORDINATE VALUE DECIMALS
0 1978-10 Newfoundland and Labrador 2016A000210 Food purchased from stores 2002=100 17 units 0 v41691246 3.4 48.1 1
1 1978-11 Newfoundland and Labrador 2016A000210 Food purchased from stores 2002=100 17 units 0 v41691246 3.4 48.1 1
2 1978-12 Newfoundland and Labrador 2016A000210 Food purchased from stores 2002=100 17 units 0 v41691246 3.4 48.2 1
3 1979-01 Newfoundland and Labrador 2016A000210 Food purchased from stores 2002=100 17 units 0 v41691246 3.4 48.8 1
4 1979-02 Newfoundland and Labrador 2016A000210 Food purchased from stores 2002=100 17 units 0 v41691246 3.4 49.8 1
5 1979-03 Newfoundland and Labrador 2016A000210 Food purchased from stores 2002=100 17 units 0 v41691246 3.4 51.0 1
6 1979-04 Newfoundland and Labrador 2016A000210 Food purchased from stores 2002=100 17 units 0 v41691246 3.4 52.1 1
7 1979-05 Newfoundland and Labrador 2016A000210 Food purchased from stores 2002=100 17 units 0 v41691246 3.4 53.2 1
8 1979-06 Newfoundland and Labrador 2016A000210 Food purchased from stores 2002=100 17 units 0 v41691246 3.4 53.4 1
9 1979-07 Newfoundland and Labrador 2016A000210 Food purchased from stores 2002=100 17 units 0 v41691246 3.4 55.0 1

To make a table that is able to be joined across all four data sets, an SQL query was implemented which takes each product type and converts it into its own column. At the same time, a unique key was created for each row by combining the province and the date in months (REF_DATE: GEO). This creates a wide format table which can be successfully joined with all of the other data sets.

In [25]:
#Consulted source:
#https://dba.stackexchange.com/questions/164711/how-to-pivot-rows-into-columns-mysql

pivoteddf = pd.read_sql_query("""
SELECT
    CONCAT(REF_DATE, ':', GEO) AS `year_month_region`,
    MAX(CASE WHEN product = 'Food purchased from stores' THEN VALUE END) AS "Food_purchased_from_stores",
    MAX(CASE WHEN product = 'Food purchased from restaurants' THEN VALUE END) AS "Food__purchased_from_restaurants",
    MAX(CASE WHEN product = 'Rented accommodation' THEN VALUE END) AS "Rented_accommodation",
    MAX(CASE WHEN product = 'Owned accommodation' THEN VALUE END) AS "Owned_accommodation",
    MAX(CASE WHEN product = 'Water, fuel and electricity' THEN VALUE END) AS "Water_fuel_and_electricity",
    MAX(CASE WHEN product = 'Household operations' THEN VALUE END) AS "Household_operations",
    MAX(CASE WHEN product = 'Household furnishings and equipment' THEN VALUE END) AS "Household_furnishings_and_equipment",
    MAX(CASE WHEN product = 'Clothing' THEN VALUE END) AS "Clothing",
    MAX(CASE WHEN product = 'Footwear' THEN VALUE END) AS "Footwear",
    MAX(CASE WHEN product = 'Clothing accessories, watches and jewellery' THEN VALUE END) AS "Clothing_accessories_watches_jewellery",
    MAX(CASE WHEN product = 'Clothing material, notions and services' THEN VALUE END) AS "Clothing_material_notions_services",
    MAX(CASE WHEN product = 'Public transportation' THEN VALUE END) AS "Public_transportation",
    MAX(CASE WHEN product = 'Gasoline' THEN VALUE END) AS "Gasoline",
    MAX(CASE WHEN product = 'Private transportation' THEN VALUE END) AS "Private_transportation",
    MAX(CASE WHEN product = 'Health care' THEN VALUE END) AS "Health_care",
    MAX(CASE WHEN product = 'Personal care' THEN VALUE END) AS "Personal_care",
    MAX(CASE WHEN product = 'Recreation' THEN VALUE END) AS "Recreation",
    MAX(CASE WHEN product = 'Education and reading' THEN VALUE END) AS "Education_and_reading",
    MAX(CASE WHEN product = 'Alcoholic beverages' THEN VALUE END) AS "Alcoholic_beverages",
    MAX(CASE WHEN product = 'Tobacco products and smokers'' supplies' THEN VALUE END) AS "Tobacco_smokers_supplies",
    MAX(CASE WHEN product = 'Recreational cannabis (201812=100)' THEN VALUE END) AS "Recreational_cannabis",
    MAX(CASE WHEN product = 'Goods' THEN VALUE END) AS "Goods",
    MAX(CASE WHEN product = 'Services' THEN VALUE END) AS "Services"
FROM CPI_initial
GROUP BY `year_month_region`
ORDER BY `year_month_region`;
""", engine)

Although this query solves the main barrier we faced with these data sets, it removes the ability to create visualizations based on the year or province without separating the key. To remedy this, I added new columns for the year and the province by splitting up the key. Simultaneously, this creates the ability to add new keys for any analysis that uses quarterly dates.

In [26]:
#creating two new columns by splitting up the key
pivoteddf[['Year', 'Province']] = pivoteddf['year_month_region'].str.split(':', expand = True)

To make the table completely accessible to each group member for analysis, the following code creates a quarter column and a quarterly key. Group members can then average CPI product values as needed.

In [27]:
pivoteddf['Year'] = pd.to_datetime(pivoteddf['Year'])
pivoteddf['Quarter'] = pivoteddf['Year'].dt.to_period('Q')
pivoteddf['Quarter'] = pivoteddf['Quarter'].astype(str)
pivoteddf['Region_Year_Quarter'] = pivoteddf['Province'] + pivoteddf['Quarter']
In [28]:
pivoteddf.to_sql('CPI_table_quarter', engine, index=False,if_exists='replace')
Out[28]:
6540

Findings

To understand the CPI data set and its characteristics as extensively as possible, a preliminary analysis of general trends was conducted.

The following code creates a visualization for the price of alcohol for the years 1979, 2002, and 2023 across all provinces. This has relevance to the affordability of alcoholic beverages, but also in the regulation and attempt to reduce alcoholic-related harms.

In [29]:
#getting CPI values for alcohol price for 1979
query_CPI_1 = pd.read_sql_query(
    "SELECT REF_DATE, product, GEO, AVG(VALUE) AS average_value "
    "FROM CPI_initial "
    "WHERE product = 'Alcoholic beverages' AND REF_DATE LIKE '%1979%'"
    "GROUP BY GEO;",
    engine
)

#getting CPI values for alcohol price for 2002
query_CPI_2 = pd.read_sql_query(
"""SELECT REF_DATE, product, GEO, AVG(VALUE) AS average_value
FROM CPI_initial
WHERE product = 'Alcoholic beverages' AND REF_DATE LIKE '%2002%'
GROUP BY GEO;""", engine
)


#getting CPI values for alcohol price for 2023
query_CPI_3 = pd.read_sql_query(
"""SELECT REF_DATE, product, GEO, AVG(VALUE) AS average_value
FROM CPI_initial
WHERE product = 'Alcoholic beverages' AND REF_DATE LIKE '%2023%'
GROUP BY GEO;""", engine
)

#creating a grouped bar chart for the queries
fig2 = go.Figure(
    data=[
        go.Bar(
            name="1979",
            x=query_CPI_1["GEO"],
            y=query_CPI_1["average_value"],
            offsetgroup=0,
        ),
        go.Bar(
            name="2002",
            x=query_CPI_2["GEO"],
            y=query_CPI_2["average_value"],
            offsetgroup=1,
        ),

        go.Bar(
            name="2023",
            x=query_CPI_3["GEO"],
            y=query_CPI_3["average_value"],
            offsetgroup=2,
        ),
    ],
    layout=go.Layout(
        title="Average Price of Alcohol By Province With 2002 as Baseline",
        yaxis_title="Average Alcohol Price(2002=100)"
    )
)
fig2.show()

The visualization displays the average price of alcohol for each province for the years 1979, 2002, and 2023. Since 2002=100 is our baseline, these bars don't hold much meaning except for reference points for other years. As indicated in the graph, the price of alcohol has stayed relatively consistent across all provinces, however, there are some minor differences. A study was conducted on the price of alcohol in Canada and concluded that the western provinces generally had higher prices than those in the east (Giesbrecht et al., 2015). This is reflected in the graph, whereby Saskatchewan has a higher average price when compared to other provinces in 2023. On the other hand, Ontario and Quebec have slightly lower prices, all when compared to 2002.

The next goal was to determine if there was a difference in prices of food across provinces for 2023. This is an especially interesting topic to investigate, as food prices have inflated outside of the range of affordability for many individuals.

In [30]:
query_CPI_4 = pd.read_sql_query("""
SELECT REF_DATE, GEO, product, avg(VALUE) as avg_price
FROM CPI_initial
WHERE REF_DATE LIKE '%2023%' AND product = 'Food purchased from stores'
GROUP BY GEO, product HAVING avg(VALUE) IS NOT NULL;""", engine)

query_CPI_5 = pd.read_sql_query("""
SELECT REF_DATE, GEO, product, avg(VALUE) as avg_price
FROM CPI_initial
WHERE REF_DATE LIKE '%2023%' AND product = 'Food purchased from restaurants'
GROUP BY GEO, product HAVING avg(VALUE) IS NOT NULL;""", engine)

fig = go.Figure(
    data=[
        go.Bar(
            name="Food Purchased From Stores",
            x=query_CPI_4["GEO"],
            y=query_CPI_4["avg_price"],
            offsetgroup=0,
        ),
        go.Bar(
            name="Food Purchased From Restaurants",
            x=query_CPI_5["GEO"],
            y=query_CPI_5["avg_price"],
            offsetgroup=1,
        ),
    ],
    layout=go.Layout(
        title="Cost of Food Across Provinces",
        yaxis_title="Average Price(2002=100)"
    )
)
fig.show()

The resulting visualization, again, shows consistency across provinces and between the 'food purchased' categories. Interestingly enough, for some provinces, the average price of eating at restaurants is often the cheaper option between the two categories. This was surprising as the opposite rhetoric is broadcasted in media. Since these results could be due to a multitude of factors, we can not be sure exactly what causes this outcome. However, some explanations may include the availability of ingredients, differences in tourism, etc. across provinces.

The next investigative query looks at the average price of housing from 1979 to 2024 in an attempt to see which aspects of housing have increased the most, as well as which years experienced the greatest increases in costs.

In [31]:
query_CPI_6 = pd.read_sql_query("""SELECT avg(VALUE) AS average_value, REF_DATE, product
FROM CPI_initial
WHERE product IN ('Rented accommodation', 'Owned accommodation', 'Water, fuel and electricity', 'Household operations', 'Household furnishings and equipment')
GROUP BY REF_DATE, product HAVING avg(VALUE) IS NOT NULL;""", engine)


fig = px.line(query_CPI_6, x= query_CPI_6['REF_DATE'], y= query_CPI_6['average_value'], color = 'product', title = 'Average Price of Housing From 1979-2024')
fig.show()

From this visualization, we can see that the price of water, fuel, and electricity has increased the most dramatically when compared to other housing costs. The most dramatic shift in the utility category occurred during 2020. This is likely due to an increase in extreme weather events, wildfires, and other government initiatives and price caps (Alleckna, 2024). This upward trend is reflected in the other categories to a lesser extent but may be due to increased demand for housing due to the increase in immigration after the pandemic ended in 2020.

The next goal was to determine if there were any significant trends in the cost of transportation in Alberta from 1979 to 2024. This becomes an increasingly important issue for Albertans as the price of transportation becomes more unaffordable.

In [32]:
query_CPI_7 = pd.read_sql_query("""
SELECT avg(VALUE) AS average_value, REF_DATE, GEO, product
FROM CPI_initial
WHERE GEO = 'Alberta' AND  product IN ('Private transportation', 'Gasoline', 'Public transportation')
GROUP BY GEO, REF_DATE, product
HAVING avg(VALUE) IS NOT NULL;""", engine)


fig = px.line(query_CPI_7, x= query_CPI_7['REF_DATE'], y= query_CPI_7['average_value'], color = 'product', title = 'Average Price of Transportation in Alberta From 1979 to 2024')
fig.show()

The price of transportation has more dramatic fluctuations as time goes on than other CPI products. This is especially visible in the gasoline category. Large variations in price are likely due to the pricing model of Gasoline-related products, which relies on supply and demand. As such, we see a severe drop in prices of gasoline around 2020, when the need for transportation decreased as restrictions increased. Following the lifting of restrictions, the price increases almost 200% since 2002.

The following bar charts were made in an attempt to understand the distribution of CPI prices across provinces, for all product categories. Please note that there should be six rows and four columns of graphs visible. The browser size can be adjusted to view all of the distributions, or the height and width can be adjusted in the code chunk.

In [33]:
query_CPI_7 = pd.read_sql_query("SELECT*FROM CPI_table_quarter",engine)
fig = make_subplots(rows=6, cols=4, subplot_titles=("Food Purchased From Stores",
                                                    "Food purchased from restaurants",
                                                    "Rented accommodation",
                                                    "Owned accommodation",
                                                    'Water, fuel and electricity',
                                                    'Household operations',
                                                    'Household furnishings and equipment',
                                                    'Clothing',
                                                    'Footwear',
                                                    'Clothing accessories, watches and jewellery',
                                                    'Clothing material, notions and services',
                                                    'Public transportation',
                                                    'Gasoline',
                                                    'Private transportation',
                                                    'Health care',
                                                    'Personal care',
                                                    'Recreation',
                                                    'Education and reading' ,
                                                    'Alcoholic beverages',
                                                    'Tobacco products and smokers supplies',
                                                    'Recreational cannabis',
                                                    'Goods',
                                                    'Services'
                                                   ))


fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Food_purchased_from_stores"], name="Food_purchased_from_stores"),
   row=1, col=1
)

fig.update_yaxes(title_text="Health Care Costs", row=1, col=1)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Food__purchased_from_restaurants"], name="Food__purchased_from_restaurants"),
    row=1, col=2
)

fig.update_yaxes(title_text="Health Care Costs", row=1, col=2)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Rented_accommodation"], name="Rented_accommodation"),
    row=1, col=3
)

fig.update_yaxes(title_text="Health Care Costs", row=1, col=3)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Owned_accommodation"], name="Owned_accommodation"),
    row=1, col=4
)

fig.update_yaxes(title_text="Health Care Costs", row=1, col=4)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Water_fuel_and_electricity"], name="Water_fuel_and_electricity"),
    row=2, col=1
)

fig.update_yaxes(title_text="Health Care Costs", row=2, col=1)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7[ "Household_operations"], name= "Household_operations"),
    row=2, col=2
)

fig.update_yaxes(title_text="Health Care Costs", row=2, col=2)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Household_furnishings_and_equipment"], name="Household_furnishings_and_equipment"),
    row=2, col=3
)

fig.update_yaxes(title_text="Health Care Costs", row=2, col=3)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Clothing"], name="Clothing"),
    row=2, col=4
)

fig.update_yaxes(title_text="Health Care Costs", row=2, col=4)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Footwear"], name="Footwear"),
    row=3, col=1
)

fig.update_yaxes(title_text="Health Care Costs", row=3, col=1)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Clothing_accessories_watches_jewellery"], name="Clothing_accessories_watches_jewellery"),
    row=3, col=2
)


fig.update_yaxes(title_text="Health Care Costs", row=3, col=2)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Clothing_material_notions_services"], name="Clothing_material_notions_services"),
    row=3, col=3
)

fig.update_yaxes(title_text="Health Care Costs", row=3, col=3)


fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Public_transportation"], name="Public_transportation"),
    row=3, col=4
)

fig.update_yaxes(title_text="Health Care Costs", row=3, col=4)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Gasoline"], name="Gasoline"),
    row=4, col=1
)


fig.update_yaxes(title_text="Health Care Costs", row=4, col=1)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Private_transportation"], name="Private_transportation"),
    row=4, col=2
)


fig.update_yaxes(title_text="Health Care Costs", row=4, col=2)


fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7[ "Health_care"], name= "Health_care"),
    row=4, col=3
)

fig.update_yaxes(title_text="Health Care Costs", row=4, col=3)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Personal_care"], name= "Personal_care"),
    row=4, col=4
)

fig.update_yaxes(title_text="Health Care Costs", row=4, col=4)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Recreation"], name= "Recreation"),
    row=5, col=1
)

fig.update_yaxes(title_text="Health Care Costs", row=5, col=1)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7[ "Education_and_reading"], name=  "Education_and_reading"),
    row=5, col=2
)

fig.update_yaxes(title_text="Health Care Costs", row=5, col=2)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Alcoholic_beverages"], name="Alcoholic_beverages"),
    row=5, col=3
)

fig.update_yaxes(title_text="Health Care Costs", row=5, col=3)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Tobacco_smokers_supplies"], name="Tobacco_smokers_supplies"),
    row=5, col=4
)

fig.update_yaxes(title_text="Health Care Costs", row=5, col=4)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Recreational_cannabis"], name="Recreational_cannabis"),
    row=6, col=1
)

fig.update_yaxes(title_text="Health Care Costs", row=6, col=1)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Goods"], name="Goods"),
    row=6, col=2
)

fig.update_yaxes(title_text="Health Care Costs", row=6, col=2)

fig.add_trace(
    go.Box(x=query_CPI_7["Province"], y=query_CPI_7["Services"], name="Services"),
    row=6, col=3
)

fig.update_xaxes(title_text="Health Care Costs", row=6, col=3)

fig.update_layout(title="CPI Comparison by Product From 1978-2024 Across All Provinces",showlegend=False, height = 3000, width = 1500)

fig.show()

Finally, a Canada wide query was created to look at trends in health care prices spanning 1978 to 2024.

In [34]:
query_CPI_8 = pd.read_sql_query("""
    SELECT
        Year,
        Province,
        AVG(Health_care) AS Health_care
    FROM
        CPI_table_quarter
    WHERE
        Year BETWEEN "1978-09-01" AND "2024-01-01"
    GROUP BY
        Province, Year
""", engine)

fig = px.line(query_CPI_8, x= query_CPI_8['Year'], y= query_CPI_8['Health_care'], color = 'Province', title = 'Average Price of Health Care By Province and Year For 1978-2024')
fig.show()

This query resulted in the realization that Alberta stood out as the most expensive province for health care in relation to prices in 2002. Due to this, the analysis of the CPI and provisional death counts will extend only to Alberta. This will provide the most distinct results but is also relevant to the class as a whole.

This individual analysis provided the group with information about how to best approach joins using CPI. More specifically, the nuanced relationship between price of goods and services, current events, and political policies are visible, suggesting key areas to focus on for further exploration.

Monthly Labour Force Characteristics (Noah)¶

The Labour force characteristics dataset is available from Statistics Canada (Government of Canada, 2018) through their Open Government License (Secretariat & Open Government, n.d.). This dataset contains information on labour statistics such as population and employment rates in canada over time. I chose this dataset because I thought it would be interesting to see how employment affects other aspects of our lives. Given that working is a major part of most peoples lives, I thought that this would be an interesting opprotunity to investigate how this affects different aspects from our other datasets, such as healthcare, opiate use, or cost of goods.

The dataset contains 7 dimensions: time, geography, labour characteristic, sex, age group, statistic, and data type. The time dimension spans the months from 1977 to 2024. The rest of the dimensions can be seen in the table below:

Geography | Labour force characteristics | Sex | Age group | Statistics | Data type --------------------------|------------------------------|------------|-----------|-----------------------------------------|---------- Newfoundland and labrador | Labour Force (thousands) | Males | 15+ | Estimate | Seasonally adjusted Price Edward Island | Employment (thousands) | Females | 15-24 | Standard error of estimate | Unadjusted Nova Scotia | Unemployment (thousands) | Both Sexes | 15-19 | Standard error of month-to-month change | Trend-cycle (last 5 months) New Brunswick | Full-time Employment (thousands) | | 20-24 | Standard error of year-over-year change Quebec | Part-time Employment (thousands) | | 25+ | Ontario | Population (thousands) | | 25-54 | Manitoba | Unemployment rate (percent) | | 55+ | Saskatchewan | Participation rate (percent) | | 15-64 | Alberta | Employment rate (percent) | | 55-64 | British Columbia | | | | Canada (Total) | | | |

Cleaning¶

This dataset contains over 5 million rows across 19 columns, and thus it is quite large. We clean and process the data in the cell below. The first function call processes the full csv file into one large table in the SQL database, and the second one runs an SQL query to pivot the full table on the "Labour force characteristics" column, meaning each entry in the above table will become its own column. These two functions are defined at the top of the notebook in the setup cells.

In [35]:
# set True to drop and re-upload the data & pivot table to the SQL database
reuploadLABOUR = True

if reuploadLABOUR:
    uploadLABOURdatabychunk(filename=LABOURPATH, tablename = "labour_initial")
    upload_labour_pivot("labour_initial", "labour_table")
table "labour_initial" dropped (or doesnt exist)
Starting upload chunk: 1
Finished upload chunk: 1
Starting upload chunk: 2
Finished upload chunk: 2
Starting upload chunk: 3
Finished upload chunk: 3
Starting upload chunk: 4
Finished upload chunk: 4
Starting upload chunk: 5
Finished upload chunk: 5
Starting upload chunk: 6
Finished upload chunk: 6
All 6 chunks uploaded to table 'labour_initial'

In the first function, we take a chunk of the CSV file, drop the repeated or redundant columns. For example, we originally have columns DGUID and GEO, which both contain information about the geography, one by name and one by code number, we drop the code number as it is repeat information. We also drop things like SYMBOL which is null for every row. After dropping unneeded columns, we rename each remaining column to have no spaces to make SQL queries easier, and append this chunk to the sql table. This is repeated until no chunks remain and the entire CSV has been read.

With the full CSV file in SQL table format, we can pivot it. In this pivot table, we want a single unique row for each province and month combination to more readily compare across datasets. We want to retain information about the various labour characteristics, so we can make each distinct value its own column. To this end, we need to select a single value for all other dimensions so we do not get duplicate rows. eg, in Alberta in 2010, there is an unemployment rate for males and for females and for both, but we only want one row for Alberta2010, not three. We decided to keep the most general subset, which is both sexes, ages 15+, and Unadjusted Estimate. With each row now being unique, we also additionally create columns to use as a primary key when joining with other datasets, namely a column of year month and region (ex 1976-01:Alberta), and another column of region year quarter (ex Alberta1976Q1).

Data Exploration¶

The head of the pivoted dataset can be seen below.

In [36]:
# pivot table head
display(
    pdsql("SELECT * FROM labour_table LIMIT 5")
)
dateregion date region Population_thousands LabourForce_thousands Employment_thousands Fulltime_thousands Parttime_thousands Unemployment_thousands Unemployment_percent Participation_percent Employment_percent quarter
0 1976-01:Alberta 1976-01-01 Alberta 1276.7 832.0 787.8 653.3 134.5 44.2 5.3 65.2 61.7 Alberta1976Q1
1 1976-01:British Columbia 1976-01-01 British Columbia 1852.4 1099.6 988.4 835.8 152.5 111.2 10.1 59.4 53.4 British Columbia1976Q1
2 1976-01:Canada 1976-01-01 Canada 16852.4 10087.1 9271.4 8007.4 1264.0 815.7 8.1 59.9 55.0 Canada1976Q1
3 1976-01:Manitoba 1976-01-01 Manitoba 732.9 441.4 411.8 348.9 63.0 29.6 6.7 60.2 56.2 Manitoba1976Q1
4 1976-01:New Brunswick 1976-01-01 New Brunswick 476.8 244.1 214.1 185.5 28.6 30.0 12.3 51.2 44.9 New Brunswick1976Q1

We can now use queries to investigate this dataset. For example, we can query to find the average unemployment rate for all provinces across the past 47 years. The query below simply groups by province, and calculates the average of the unemployment rate for each group.

In [37]:
# SQL Query -> average unemployment over the last 47 years
labouravg = pdsql("""
SELECT
    region,
    AVG(value) as AvgUnemployment
FROM labour_initial
WHERE
    statistic = 'Estimate'
    AND datatype = 'Unadjusted'
    AND sex = 'Both sexes'
    AND agegroup = '15 years and over'
    AND labourcharacteristics = 'Unemployment rate'
GROUP BY region
ORDER BY AvgUnemployment
""")
In [38]:
display(labouravg)
region AvgUnemployment
0 Saskatchewan 5.892479
1 Manitoba 6.293504
2 Alberta 6.559487
3 Ontario 7.395214
4 Canada 8.066154
5 British Columbia 8.085641
6 Quebec 9.281880
7 Nova Scotia 10.060513
8 New Brunswick 10.900342
9 Prince Edward Island 11.999658
10 Newfoundland and Labrador 15.589573
In [39]:
# plot average unemployment over the last 47 years
fig, ax = choroplot(
    dataframe = labouravg,
    data_whichcolumn = 'AvgUnemployment',
    prov_filename = GEOJSONPATH,
    data_regionnamecolumn = 'region',
    plotsize = (10,8),
    plottitlename = 'Unemployment Across Canada (1977-2024 Average)',
    plotlegendname = 'Unemployment (%)',
    plotlegendorientation = 'vertical',
    datacolormap = 'OrRd',
    vmin = 0,
    missingcolor = 'lightgrey',
    legendon = True,
    boundaryon = True,
    axisoff = True)

fig.show()
No description has been provided for this image

Here we can see that on average, Saskatchewan has the lowest unemployment rate, while Newfoundland and Labrador has the highest. This could be due to a larger number of people retiring to the east coast, or perhaps people who choose to move to the praries (or stay living in) mainly do so because of a job.

We can also find out which year had the best average employment rate in each province. The query below first groups by all combinations of region and year, and then uses a window function to rank these rows by average employment rate to find the best one in each province.

In [40]:
# SQL Query -> best year for employment in each province
bestyearlabour = pdsql("""
SELECT a.region AS Region, a.year AS Year, a.yearavg AS AverageEmploymentRate
FROM (SELECT b.region, b.yearavg, b.year, b.labourcharacteristics, ROW_NUMBER() OVER (PARTITION BY b.region ORDER BY b.yearavg DESC) ranked_order
                FROM (
                        SELECT region, AVG(value) as yearavg, SUBSTRING(date, 1, 4) AS year, labourcharacteristics
                        FROM labour_initial
                        WHERE labourcharacteristics = 'employment rate'
                                AND sex = 'both sexes'
                                AND agegroup = '15 years and over'
                                AND statistic = 'estimate'
                                AND datatype = 'unadjusted'
                        GROUP BY region,year) as b
    ) AS a
WHERE a.ranked_order = 1
ORDER BY a.yearavg
""")
In [41]:
display(bestyearlabour)
Region Year AverageEmploymentRate
0 Newfoundland and Labrador 2013 54.416667
1 Nova Scotia 2008 58.858333
2 New Brunswick 2008 59.341667
3 Prince Edward Island 2013 61.983333
4 Quebec 2023 62.066667
5 British Columbia 2007 63.308333
6 Canada 2008 63.341667
7 Manitoba 2008 66.416667
8 Ontario 1989 66.458333
9 Saskatchewan 2013 67.358333
10 Alberta 2008 71.916667

From this we can see that Alberta's best year was 2008 with an employment rate of just under 72% on average, which was also the best out of all provinces. This may be due to the financial crisis in 2008, where it seems Alberta had more jobs available or perhaps cheaper housing (and thus people moving to Alberta in greater numbers).

We can also investigate how the employment rate gender gap for adults (25+) has changed over the decades in canada as a whole. The query below selects the employment rate for canada as a whole, groups by decade, and then splits by the value of the sex column (male, female, both sexes).

In [42]:
# SQL Query -> Employment rate by gender, by decade
genderdecadelabour = pdsql("""
SELECT
    CONCAT(SUBSTRING(date, 1, 3), "", "0") AS decade,
    AVG(CASE WHEN sex = 'Males' THEN value END) as AvgMale,
    AVG(CASE WHEN sex = 'Females' THEN value END) as AvgFemale,
    AVG(CASE WHEN sex = 'Both Sexes' THEN value END) as AvgBoth,
    AVG(CASE WHEN sex = 'Males' THEN value END) - AVG(CASE WHEN sex = 'Females' THEN value END) AS Difference
FROM labour_initial
WHERE region = 'Canada'
    AND labourcharacteristics = 'Employment rate'
    AND agegroup = '25 years and over'
    AND statistic = 'estimate'
    AND datatype = 'unadjusted'
GROUP BY decade
ORDER BY Difference DESC
""")
In [43]:
# Plot: Employment rates by gender by decade
multi_line_plot(
    df = genderdecadelabour,
    columntuples = [('decade', 'AvgMale'), ('decade', 'AvgFemale'), ('decade', 'AvgBoth')],
    serieslabels = ['Average Male', 'Average Female', 'Average Both'],
    xlabel = 'Decade',
    ylabel = 'Employment Rate (%)',
    title = 'Gender Employment Gap in Canada by Decade',
    figsize = (1100, 500),
    titlesize = 30
)

Here we can see that the employment rate gap of male over female adults (25+) in canada total was over 35% back in the 1970's and has since decreased to around 9%. Interestingly enough, the male employment rate has decreased (or stayed about the same) every decade, while the female employment rate has increased (or stayed roughly the same). The increase in female employment is likely due to an increase in gender equality and labour laws over the years, while the decrease in male employment reflects the same thing (because the jobs have to come from somewhere, and the overall employment rate stays roughly steady around 60%).

The final investigation we did into the labour data alone is looking at the participation rate due to the covid 19 pandemic. The participation rate is the percentage of people who are capable of working, who are either employed or looking for work. The query below finds the provinces with above average decrease in participation from 2019 to 2020.

In [44]:
# SQL Query -> above average participation dropoff 2019-2020, by province
labourcovid = pdsql("""
SELECT *
    FROM (
        SELECT
            region,
            AVG(CASE WHEN date LIKE '2019%' THEN value END) AS avg2019,
            AVG(CASE WHEN date LIKE '2020%' THEN value END) AS avg2020,
            AVG(CASE WHEN date LIKE '2019%' THEN value END) - AVG(CASE WHEN date LIKE '2020%' THEN value END) as diff
        FROM labour_initial
        WHERE labourcharacteristics = 'participation rate'
            AND sex = 'both sexes'
            AND agegroup = '15 years and over'
            AND statistic = 'estimate'
            AND datatype = 'unadjusted'
        GROUP BY region ) as t1
WHERE t1.diff > (SELECT AVG(diff) FROM (
            SELECT
                region,
                AVG(CASE WHEN date LIKE '2019%' THEN value END) - AVG(CASE WHEN date LIKE '2020%' THEN value END) as diff
            FROM labour_initial
            WHERE labourcharacteristics = 'participation rate'
                AND sex = 'both sexes'
                AND agegroup = '15 years and over'
                AND statistic = 'estimate'
                AND datatype = 'unadjusted'
            GROUP BY region) AS t2
        )
ORDER BY diff DESC;
""")
In [45]:
display(labourcovid)
region avg2019 avg2020 diff
0 Alberta 71.700000 68.941667 2.758333
1 Newfoundland and Labrador 58.741667 56.458333 2.283333
2 Prince Edward Island 67.008333 64.808333 2.200000
3 British Columbia 66.333333 64.216667 2.116667
4 Nova Scotia 62.258333 60.425000 1.833333
5 Saskatchewan 69.091667 67.275000 1.816667

From the output, we can see that the largest dropoff in participation due to covid was in Alberta at 2.75% decrease between 2019 and 2020. This is likely due to the more relaxed restrictions in the province due to the pandemic, resulting in a larger outbreak, and more jobs having to shutdown subsuquently as a result.

Provisional Weekly Death Counts (Bobbi)¶

The first step of my analysis is to work on cleaning and exploring the weekly death count dataset. This dataset is provided by Statistics Canada and can be found by following the link: Weekly Death Count. This dataset contains 108,823 rows and 15 columns. It spans from 2010 - 2024 and provides a weekly death count based on 11 different causes of death.

In [46]:
# read csv to a dataframe
deaths_df = pd.read_csv(DEATHSPATH)
deaths_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108823 entries, 0 to 108822
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   REF_DATE         108823 non-null  object 
 1   GEO              108823 non-null  object 
 2   DGUID            108823 non-null  object 
 3   Cause of death   108823 non-null  object 
 4   Characteristics  108823 non-null  object 
 5   UOM              108823 non-null  object 
 6   UOM_ID           108823 non-null  int64  
 7   SCALAR_FACTOR    108823 non-null  object 
 8   SCALAR_ID        108823 non-null  int64  
 9   VECTOR           108823 non-null  object 
 10  COORDINATE       108823 non-null  object 
 11  VALUE            97850 non-null   float64
 12  STATUS           10973 non-null   object 
 13  SYMBOL           0 non-null       float64
 14  TERMINATED       0 non-null       float64
 15  DECIMALS         108823 non-null  int64  
dtypes: float64(3), int64(3), object(10)
memory usage: 13.3+ MB
In [47]:
# output the first 5 rows
deaths_df.head()
Out[47]:
REF_DATE GEO DGUID Cause of death Characteristics UOM UOM_ID SCALAR_FACTOR SCALAR_ID VECTOR COORDINATE VALUE STATUS SYMBOL TERMINATED DECIMALS
0 2010-01-09 Newfoundland and Labrador, place of occurrence 2016A000210 Malignant neoplasms [C00-C97] Number of deaths Number 223 units 0 v1234858198 2.2.1 25.0 NaN NaN NaN 0
1 2010-01-16 Newfoundland and Labrador, place of occurrence 2016A000210 Malignant neoplasms [C00-C97] Number of deaths Number 223 units 0 v1234858198 2.2.1 35.0 NaN NaN NaN 0
2 2010-01-23 Newfoundland and Labrador, place of occurrence 2016A000210 Malignant neoplasms [C00-C97] Number of deaths Number 223 units 0 v1234858198 2.2.1 20.0 NaN NaN NaN 0
3 2010-01-30 Newfoundland and Labrador, place of occurrence 2016A000210 Malignant neoplasms [C00-C97] Number of deaths Number 223 units 0 v1234858198 2.2.1 20.0 NaN NaN NaN 0
4 2010-02-06 Newfoundland and Labrador, place of occurrence 2016A000210 Malignant neoplasms [C00-C97] Number of deaths Number 223 units 0 v1234858198 2.2.1 25.0 NaN NaN NaN 0

Cleaning the Data Frame¶

I decided that it makes the most sense to clean the dataset in pandas and create queries in SQL. I started by dropping columns that the group does not need for the project. This leaves the reference date, geography, cause of death and value. I renamed the columns so they will correspond to the other three datasets we will be using, specifically region and date. I turned the date into a datetime object, and stripped region and cause of death of any useless string characters. The next step was to create a unique key to be able to join datasets in order to answer our guiding questions. To do this, I created two new columns, Monthly and Quarterly. The monthly column contains the month and year and the quarterly column contains the quarter and the year. Each of these columns are then merged with the region column to create two new columns, RegionYearQuarter and YearMonthRegion. These columns were created to be exactly the same as columns created in the other datasets by all other group members. Some datasets will be joined by month and others will be joined by quarter. I created two seperate tables to make the joining process easier. The quarterly dataframe takes the RegionYearQuarter, Cause of Death and sums the number of deaths per quarter. The monthly dataframe takes the YearMonthRegion, Cause of Death and sums the number of deaths per month. Both of the tables are pivoted to make joining them easier. Lastly, the tables are uploaded into SQL.

In [48]:
deaths_df.drop(columns = ['DGUID', 'Characteristics','UOM','UOM_ID','SCALAR_FACTOR',
                           'SCALAR_FACTOR','SCALAR_ID','VECTOR','COORDINATE','STATUS',
                           'SYMBOL','TERMINATED','DECIMALS'], inplace = True)
In [49]:
deaths_df.rename(columns = {'REF_DATE': 'Date',
                            'GEO': 'Region',
                            'Cause of death' : 'Cause_of_Death',
                            'VALUE' : 'Number_of_Deaths'}, inplace=True)
In [50]:
# add cloumn and change Date from object to datetime and change to quarterly
deaths_df['Quarterly'] = pd.to_datetime(deaths_df['Date']).dt.to_period('Q').astype(str)

# change Date from object to datetime and change to monthly
deaths_df['Monthly'] = pd.to_datetime(deaths_df['Date']).dt.to_period('M').astype(str)

# strip Province of ", place of occurrence" and excess whitespace
deaths_df['Region'] = deaths_df['Region'].str.replace(', place of occurrence', '', regex=True).str.strip()

# strip Cause_of_Death of "[]" and excess whitespace
deaths_df['Cause_of_Death'] = deaths_df['Cause_of_Death'].str.replace('\[.*?\]', '', regex=True).str.strip()

# add column to create unique key for quarterly
deaths_df['RegionYearQuarter'] = deaths_df['Region'].astype(str) + deaths_df['Quarterly'].astype(str)

# add column to create unique key for monthly
deaths_df['YearMonthRegion'] = deaths_df['Monthly'].astype(str) + str(":") + deaths_df['Region'].astype(str)
In [51]:
# import data into SQL dataframe, if it already exists replace it
# import in chunks to aviod crashing
deaths_df.to_sql('deaths', engine, if_exists='replace', index=None,
                 index_label=None, chunksize=1000, dtype=None, method=None)
Out[51]:
108823
In [52]:
# drop date column
deaths_df.drop(columns=['Date'], inplace=True)

deaths_df.head()
Out[52]:
Region Cause_of_Death Number_of_Deaths Quarterly Monthly RegionYearQuarter YearMonthRegion
0 Newfoundland and Labrador Malignant neoplasms 25.0 2010Q1 2010-01 Newfoundland and Labrador2010Q1 2010-01:Newfoundland and Labrador
1 Newfoundland and Labrador Malignant neoplasms 35.0 2010Q1 2010-01 Newfoundland and Labrador2010Q1 2010-01:Newfoundland and Labrador
2 Newfoundland and Labrador Malignant neoplasms 20.0 2010Q1 2010-01 Newfoundland and Labrador2010Q1 2010-01:Newfoundland and Labrador
3 Newfoundland and Labrador Malignant neoplasms 20.0 2010Q1 2010-01 Newfoundland and Labrador2010Q1 2010-01:Newfoundland and Labrador
4 Newfoundland and Labrador Malignant neoplasms 25.0 2010Q1 2010-02 Newfoundland and Labrador2010Q1 2010-02:Newfoundland and Labrador
In [53]:
quarterly_df = deaths_df.groupby(['RegionYearQuarter','Cause_of_Death'])['Number_of_Deaths'].sum().reset_index()
quarterly_df.head()
Out[53]:
RegionYearQuarter Cause_of_Death Number_of_Deaths
0 Alberta2010Q1 Accidents (unintentional injuries) 155.0
1 Alberta2010Q1 Alzheimer's disease 110.0
2 Alberta2010Q1 COVID-19 0.0
3 Alberta2010Q1 Cerebrovascular diseases 240.0
4 Alberta2010Q1 Chronic lower respiratory diseases 220.0
In [54]:
death_table_quarter = quarterly_df.pivot(index='RegionYearQuarter', columns='Cause_of_Death', values='Number_of_Deaths')

death_table_quarter.rename(columns = {
                                "Alzheimer's disease": "Alzheimer's_Disease",
                                "Cerebrovascular diseases" : "Cerebrovascular_Diseases",
                                "Chronic lower respiratory diseases" : "Chronic_Lower_Respiratory_Diseases",
                                "Malignant neoplasms":"Malignant_Neoplasms",
                                "Diabetes mellitus":"Diabetes_Mellitus",
                                "Diseases of heart":"Diseases_of_Heart",
                                "Influenza and pneumonia":"Influenza_and_Pneumonia",
                                "Intentional self-harm (suicide)":"Suicide",
                                "Nephritis, nephrotic syndrome and nephrosis":"Nephritis,_Nephrotic_Syndrome_and_Nephrosis",
                                "Accidents (unintentional injuries)": "Accidents_(Unintentional_Injuries)"},inplace=True)

death_table_quarter.head()
Out[54]:
Cause_of_Death Accidents_(Unintentional_Injuries) Alzheimer's_Disease COVID-19 Cerebrovascular_Diseases Chronic_Lower_Respiratory_Diseases Diabetes_Mellitus Diseases_of_Heart Influenza_and_Pneumonia Suicide Malignant_Neoplasms Nephritis,_Nephrotic_Syndrome_and_Nephrosis
RegionYearQuarter
Alberta2010Q1 155.0 110.0 0.0 240.0 220.0 105.0 1185.0 70.0 105.0 1310.0 80.0
Alberta2010Q2 195.0 90.0 0.0 280.0 240.0 85.0 1185.0 100.0 135.0 1340.0 75.0
Alberta2010Q3 240.0 80.0 0.0 265.0 195.0 80.0 1080.0 90.0 170.0 1465.0 60.0
Alberta2010Q4 270.0 95.0 0.0 280.0 220.0 95.0 1230.0 105.0 130.0 1400.0 75.0
Alberta2011Q1 195.0 110.0 0.0 285.0 325.0 135.0 1355.0 120.0 115.0 1475.0 70.0
In [55]:
monthly_df = deaths_df.groupby(['YearMonthRegion','Cause_of_Death'])['Number_of_Deaths'].sum().reset_index()
monthly_df.head()
Out[55]:
YearMonthRegion Cause_of_Death Number_of_Deaths
0 2010-01:Alberta Accidents (unintentional injuries) 55.0
1 2010-01:Alberta Alzheimer's disease 35.0
2 2010-01:Alberta COVID-19 0.0
3 2010-01:Alberta Cerebrovascular diseases 90.0
4 2010-01:Alberta Chronic lower respiratory diseases 60.0
In [56]:
death_table_month = monthly_df.pivot(index='YearMonthRegion', columns='Cause_of_Death', values='Number_of_Deaths')

death_table_month.rename(columns = {
                                "Alzheimers disease": "Alzheimer's_Disease",
                                "Cerebrovascular diseases" : "Cerebrovascular_Diseases",
                                "Chronic lower respiratory diseases" : "Chronic_Lower_Respiratory_Diseases",
                                "Malignant neoplasms":"Malignant_Neoplasms",
                                "Diabetes mellitus":"Diabetes_Mellitus",
                                "Diseases of heart":"Diseases_of_Heart",
                                "Influenza and pneumonia":"Influenza_and_Pneumonia",
                                "Intentional self-harm (suicide)":"Suicide",
                                "Nephritis, nephrotic syndrome and nephrosis":"Nephritis,_Nephrotic_Syndrome_and_Nephrosis",
                                "Accidents (unintentional injuries)": "Accidents_(Unintentional_Injuries)"},inplace=True)

death_table_month.head()
Out[56]:
Cause_of_Death Accidents_(Unintentional_Injuries) Alzheimer's disease COVID-19 Cerebrovascular_Diseases Chronic_Lower_Respiratory_Diseases Diabetes_Mellitus Diseases_of_Heart Influenza_and_Pneumonia Suicide Malignant_Neoplasms Nephritis,_Nephrotic_Syndrome_and_Nephrosis
YearMonthRegion
2010-01:Alberta 55.0 35.0 0.0 90.0 60.0 35.0 400.0 25.0 35.0 430.0 25.0
2010-01:British Columbia 115.0 50.0 0.0 170.0 120.0 85.0 500.0 110.0 35.0 710.0 35.0
2010-01:Manitoba 50.0 15.0 0.0 40.0 35.0 30.0 185.0 15.0 5.0 205.0 20.0
2010-01:New Brunswick 20.0 5.0 0.0 20.0 15.0 15.0 95.0 15.0 10.0 145.0 15.0
2010-01:Newfoundland and Labrador 20.0 15.0 0.0 30.0 15.0 10.0 65.0 5.0 5.0 100.0 0.0
In [57]:
# upload table into SQL
death_table_quarter.to_sql("death_table_quarter", engine, if_exists = "replace")
Out[57]:
767
In [58]:
# upload table into SQL
death_table_month.to_sql("death_table_month", engine, if_exists = "replace")
Out[58]:
2288

Exploring the Data¶

It is important to explore the data individually before joining it with other data sets. "Data preparation also involves finding relevant data to ensure that analytics applications deliver meaningful information and actionable insights for business decision-making"(Stedman, 2024). I will begin my analysis by looking at the averages, minimums, and maximums of the number of deaths for each cause of death.

In [59]:
# create query to find max, min, average deaths for each cause per week
summary_query = pd.read_sql_query("""SELECT Cause_of_Death, AVG(Number_of_Deaths),
                                MAX(Number_of_Deaths), MIN(Number_of_Deaths)
                                FROM deaths GROUP BY Cause_of_Death""", engine)
print(summary_query)
                                 Cause_of_Death  AVG(Number_of_Deaths)  \
0            Accidents (unintentional injuries)              21.973255   
1                           Alzheimer's disease               9.289776   
2                      Cerebrovascular diseases              20.627567   
3            Chronic lower respiratory diseases              18.336843   
4                                      COVID-19              20.692708   
5                             Diabetes mellitus              10.770770   
6                             Diseases of heart              79.650416   
7                       Influenza and pneumonia               9.714647   
8               Intentional self-harm (suicide)               6.226703   
9                           Malignant neoplasms             118.979151   
10  Nephritis, nephrotic syndrome and nephrosis               5.419606   

    MAX(Number_of_Deaths)  MIN(Number_of_Deaths)  
0                   315.0                    0.0  
1                    85.0                    0.0  
2                   135.0                    0.0  
3                   140.0                    0.0  
4                   850.0                    0.0  
5                    95.0                    0.0  
6                   565.0                    0.0  
7                   220.0                    0.0  
8                    45.0                    0.0  
9                   670.0                    0.0  
10                   45.0                    0.0  

Finding the average, maximum, and minimum can be useful to get an idea of the spread of the data. To create this query I selected the cause of death, average number of deaths, maximum number of deaths, and minimum number of deaths from the dataframe "deaths" and then grouped them by causes of death. This information can be displayed using a box plot.

In [60]:
# create query to find cause of death and number of deaths per week
deaths_query = pd.read_sql_query("""SELECT Cause_of_Death, Number_of_Deaths
                                    FROM deaths""", engine)
In [61]:
fig = px.box(deaths_query,
             x='Cause_of_Death',
             y='Number_of_Deaths',
             title='Number of Deaths for each Cause of Death per Week in Canada from 2010-2024',
             labels={'Cause_of_Death': 'Cause of Death', 'Number_of_Deaths': 'Number of Deaths'},
             width = 1000,
             height = 700)
fig.show()

This query was created by selecting the cause of death and number of deaths from the dataframe "deaths." This query is extremely useful to gather the information needed to create a box plot. Similar to the average, maximum, and minimum, the box plot shows the spread of the data. It is nice to be able to see the quantiles, and the outliers. Suicide has little outliers compared to the other causes of death. This tells us that the average number of deaths per week is quite consistent. Whereas COVID-19 has many outliers due to its inconsistency. This makes sense because when COVID-19 was first introduced into Canada it was highly contagious and took many lives. Since the creation of vaccines the number of deaths per week has decreased.

Next we will look at the total number of deaths for each cause of death in Alberta in 2020.

In [62]:
# create query to find total number of deaths for each cause in Alberta in 2020
total_query = pd.read_sql_query("""SELECT Cause_of_Death,
            SUM(Number_of_Deaths) FROM deaths WHERE Region = 'Alberta'
            AND DATE_FORMAT(Date, '%Y') = '2020' GROUP BY Cause_of_Death
            ORDER BY SUM(Number_of_Deaths) DESC""", engine)
In [63]:
fig = px.bar(total_query,
                    x='Cause_of_Death',
                    y='SUM(Number_of_Deaths)',
                    title='Total Number of Deaths for each Cause of Death in Alberta in 2020',
                    labels={'Cause_of_Death': 'Cause of Death', 'SUM(Number_of_Deaths)': 'Total Number of Deaths'},
                    width = 1000,
                    height = 700)
fig.show()

This query was created by selecting cause of death and the sum of number of deaths from the dataframe "deaths" where the region is Alberta and the year is 2020. Lastly these are grouped by cause of death, and put in descending order of the total number of deaths.

The bar graph above shows the total number of deaths from each cause of death in Alberta in 2020. I chose to look at the year 2020 because I was curious to see if COVID-19 was one of the top causes of death. That doesn't seem to be the case, and is actually the 6th cause of death in Alberta in 2020. (That is in comparison to only 10 other causes.)

From here, I wanted to find the leading cause of death in Canada for each year from 2010-2024.

In [64]:
# create query to find the total number of deaths for each cause for each year
causes_query = pd.read_sql_query("""SELECT Cause_of_Death,
            SUM(Number_of_Deaths), DATE_FORMAT(Date, '%Y') AS Year
            FROM deaths GROUP BY Cause_of_Death, Year""", engine)
In [65]:
fig = px.line(causes_query,
                    x='Year',
                    y='SUM(Number_of_Deaths)',
                    color='Cause_of_Death',
                    title='Number of Deaths from Each Cause per Year in Canada from 2010-2024',
                    labels={'Year': 'Year', 'SUM(Number_of_Deaths)': 'Total Number of Deaths'},
                    width = 1300,
                    height = 700)
fig.show()

This query was created by selecting cause of death, the sum of number of deaths, and the year from the dataframe "deaths" and are grouped by cause of death, and year. This information is used to create a line graph.

This graph gives us some insight into what the number one cause of death in Canada has been from 2010-2024. Malignant neoplasms seem to consistently be the leading cause of death in Canada. (Based on only 10 other causes of death.) Another interesting insight we gain from this graph, is the apparent drop in number of deaths for all causes of death in 2024. The main reason for this drop is most likely attributed to the fact this data was pulled halfway through 2024 and therefore the data is not fully complete.

The last graph I wanted to explore shows the number of suicide related deaths in each province in the year 2020.

In [66]:
# create query to find the total number of deaths caused by suicide in Canada in 2020
suicide_query = pd.read_sql_query("""SELECT Region, SUM(Number_of_Deaths)
            FROM deaths WHERE Cause_of_Death = 'Intentional self-harm (suicide)'
            AND DATE_FORMAT(Date, '%Y') = '2020' GROUP BY Region""", engine)
In [67]:
fig = px.bar(suicide_query,
                    x='Region',
                    y='SUM(Number_of_Deaths)',
                    title='Total Number of Deaths Caused by Suicide in Canada in 2020',
                    labels={'Region': 'Region', 'SUM(Number_of_Deaths)': 'Total Number of Deaths'},
                    width = 1000,
                    height = 700)
fig.show()

This query was created by selecting region and the sum of number of deaths from the dataframe "deaths" where the cause of death is suicide and the year is 2020 and then are grouped by the year. This information is used to create the bargraph.

This graph shows the number of deaths caused by suicide in Alberta in 2020. This graph doesnt tell us much in regards to which province has the most deaths caused by suicide because the population density is different for each region. When discussing such a difficult topic, one way to make a meaningful impact on the population when sharing statistics is to give numbers. For example, is Ontario in 2020 the number of suicide related deaths was 1,395. Hearing a large number like that can open people's eyes and get them involved in suicide prevention programs.

This concludes the analysis into the Provisional Weekly Death Counts dataset.

Data exploration¶

As a team, create a set of queries which draw data from the datasets chosen by multiple team members. The dataset of each team member should be used in conjunction with one or more datasets from more than one team member. The queries should tell you something about your datasets together, which queried alone, would not be obvious or possible to extract from your dataset. What did you learn from the total collection of your chosen datasets? Discuss the steps you took to combine your datasets. Show your results, and the work you did to produce your results.

Is there a correlation between shelter price and opioid-related deaths?¶

To start I am going to creae an SQL query that will pull all the data from the opiates data set, as well as some chosen columns from the CPI dataset and join them together in order to look at the correlations between the different variables. To do this I will need to create a subquery where I select the columns I want from the CPI dataset and group them together based on the Region year and quarter in order to be able to do a one to one join with the opioid dataset.

In [68]:
correlationdata = SQL("""select *
                        from Opiates

                        join
                        (select Region_Year_Quarter,
                        avg(Rented_accommodation) Rented_accommodation,
                        avg(Owned_accommodation) Owned_accommodation
                        from CPI_table_quarter
                        group by Region_Year_Quarter) as CPIavg
                        on Opiates.Region_Year_Quarter=CPIavg.Region_Year_Quarter """)

allcordata = correlationdata.drop(columns = ["Year_Quarter","Region_Year_Quarter","Accidental_stimulant_deaths","Accidental_stimulant_poisoning_ED_visits","Accidental_stimulant_poisoning_hospitalizations","Total_stimulant_poisoning_ED_visits"], inplace=True)
allcordata = correlationdata.filter(['Region','Total_opioid_toxicity_deaths', 'Rented_accommodation','Owned_accommodation'])

cancordata = allcordata.drop(columns = 'Region')

With this new dataset that we acquired from the SQL query we can now create a correlation matrix to look at the correlation values for the different variables. You can hover over each of the blocks to see each of the exact correlations, rather than just going based on color.

In [69]:
opfig = px.imshow(cancordata.corr(), color_continuous_scale='PuBu')
opfig.show()

From this graph we see that the correlation coefficients between the total toxicity deaths and rented accomodation and owned accomodation are 0.1051735 and 0.06451971 respectively. We see that as a whole the two accomodation CPIs do not have a large correlation to the total opioid deaths in the form that they currently are in. This may change looking at it by region.

To help us do this we are now going to create a new SQL query which is going to be grouped by year and quarter, as to give us total numbers across Canada for each quarter.

In [70]:
overalltotals = SQL("""select Region,
                        Year_Quarter,
                        Opiates.Region_Year_Quarter,
                        avg(Owned_accommodation) Owned_accommodation,
                        avg(Rented_accommodation) Rented_accommodation,
                        (avg(Rented_accommodation)+avg(Owned_accommodation))/2 Combined_accommodation,
                        avg(Health_care) as Health_care,
                        sum(Total_opioid_toxicity_deaths) as Total_opioid_toxicity_deaths
                        from Opiates

                        join
                        (select Region_Year_Quarter,
                        avg(Rented_accommodation) Rented_accommodation,
                        avg(Owned_accommodation) Owned_accommodation,
                        avg(Health_care) Health_care
                        from CPI_table_quarter
                        group by Region_Year_Quarter) as CPIavg
                        on Opiates.Region_Year_Quarter=CPIavg.Region_Year_Quarter

                        where region != 'Whitehorse, Yukon'
                        and region != 'Yellowknife, Northwest Territories'
                        group by Year_Quarter""")

From this query we are able to graph some things to help us better understand the data and the relationship between variables that we are looking at.

To start we are going to graph the total opioid toxicity deaths as a function of accommodation CPI so that we can see if there is any sort of relationship between the two variables.

In [71]:
opfig1 = px.scatter(overalltotals, x= ['Owned_accommodation', 'Rented_accommodation'], y = 'Total_opioid_toxicity_deaths',
                    trendline = 'ols',
                    labels = {'Total_opioid_toxicity_deaths':'<b>Total Opioid Deaths<b>', 'Owned_accommodation':'<b>Owned Accomodation CPI<b>', 'variable':'Metric', 'value':'<b>Accommodation CPI<b>'},
                    title = '<b>Total Opioid Deaths as a Function of Accomodation CPI<b>',
                    width =1100,
                    height = 400)

opfig1.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01))

opfig1.update_xaxes(ticks = 'outside')
opfig1.update_yaxes(ticks = 'outside')

opfig1.update_layout(title_x=0.5, title=dict(font=dict(size=30)))
opfig1.show()

From this graph we see that there seems to be a correlation between the accommodation CPI and the total number of opioid deaths. From this interactable graph we can see that the regression line for the owned accommodation has the following formula Y = 35.986*X - 4182.25

And the regression line for the rented accommodation has the following formula Y = 48.4324*X - 4929.82

These formulas show that there is a positive correlation between deaths from opioids and accommodation CPI. With the rented accomodation CPI having a larger impact on the number of deaths from opioids. For every one unit increase in rented accommodation CPI there is a predicted increase of 48.43 deaths per quarter. This does agree with the findings that were found by Cheng and his associates where they determined that housing status affected the drug usage, especially among youths (Cheng et al.). This is something we will further investigate and see if there may be different provinces who have a stronger correlation between the two variables.

I also want to look at the combined accommodation price that was calculated in the query.

In [72]:
opfig2 = px.scatter(overalltotals, x= 'Combined_accommodation', y = 'Total_opioid_toxicity_deaths',
                    trendline = 'ols',
                    labels = {'Total_opioid_toxicity_deaths':'<b>Total Opioid Deaths<b>', 'Combined_accommodation':'<b>Combined Accomodation CPI<b>'},
                    title = '<b>Total Opioid Deaths as a Function of Owned Accomodation CPI<b>',
                    width =1100,
                    height = 400)

opfig2.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01))

opfig2.update_xaxes(ticks = 'outside')
opfig2.update_yaxes(ticks = 'outside')

opfig2.update_layout(title_x=0.5, title=dict(font=dict(size=30)))

opfig2.show()

From this interactable graph we see that the linear regression model for this set of points is Y = 41.4874*X - 4528.69

This shows that for every one unit increase in the average accommodation price between owned and rented, there will be a prediced increase in deaths by opioids of 41.49 per quarter.

The following SQL statement is going to be grouped by region so that we will get a total value for the overall region. We will use this to graph all the different provinces and see if we see any correlation among them.

In [73]:
totalsperregion = SQL("""select Region,
                        Year_Quarter,
                        Opiates.Region_Year_Quarter,
                        sum(Owned_accommodation) Owned_accommodation,
                        sum(Rented_accommodation) Rented_accommodation,
                        sum(Owned_accommodation) + sum(Rented_accommodation) Both_accommodation,
                        sum(Health_care) as Health_care,
                        sum(Total_opioid_toxicity_deaths) as Total_opioid_toxicity_deaths
                        from Opiates

                        join
                        (select Region_Year_Quarter,
                        avg(Rented_accommodation) Rented_accommodation,
                        avg(Owned_accommodation) Owned_accommodation,
                        avg(Health_care) Health_care
                        from CPI_table_quarter
                        group by Region_Year_Quarter) as CPIavg
                        on Opiates.Region_Year_Quarter=CPIavg.Region_Year_Quarter

                        where region != 'Whitehorse, Yukon'
                        and region != 'Yellowknife, Northwest Territories'
                        group by Region
                        order by Total_opioid_toxicity_deaths desc""")
In [74]:
opfig3 = px.bar(totalsperregion,
                x= 'Region',
                y=['Total_opioid_toxicity_deaths','Both_accommodation'],
                title = '<b>Total Opioid Deaths Per Quarter<b>',
                labels = {'Total_opioid_toxicity_deaths':'<b>Total Opioid Deaths<b>', 'Year_Quarter':'<b>Year and Quarter<b>','value':''},
                width =1100,
                height = 500,
                barmode='group')

opfig3.update_xaxes(nticks=10, ticks = 'outside')
opfig3.update_yaxes(ticks = 'outside')

opfig3.update_layout(title_x=0.5, title=dict(font=dict(size=30)))

opfig3.show()

From this graph we see that there is no direct correlation between the total amount of opioid deaths in a province and the sum of the rented and owned accommodation CPI. This does make sense because of the fact that the CPI is not very dependant on the overall population. Whereas, the number of opioid deaths per province is quite influenced by the total population of the province. Because of this it makes sense that we do not see a correlation from this graph, we must make better visuals in order to determine if there is a correlation between these variables per region.

We are going to make a new query which will give us the data of these two tables grouped by the region year and quarter so that we can look at each provinces fluctuation through time.

In [75]:
databydate = SQL("""select region,
                    Year_Quarter,
                    avg(Owned_accommodation) Owned_accommodation,
                    avg(Rented_accommodation) Rented_accommodation,
                    (avg(Rented_accommodation)+avg(Owned_accommodation))/2 Combined_accommodation,
                    avg(Health_care) as Health_care,
                    sum(Total_opioid_toxicity_deaths) as Total_opioid_toxicity_deaths
                    from Opiates

                    join (select Region_Year_Quarter,
                    avg(Rented_accommodation) Rented_accommodation,
                    avg(Owned_accommodation) Owned_accommodation,
                    avg(Health_care) Health_care
                    from CPI_table_quarter
                    group by Region_Year_Quarter) as CPIavg
                    on Opiates.Region_Year_Quarter=CPIavg.Region_Year_Quarter

                    where region != 'Whitehorse, Yukon'
                    and region != 'Yellowknife, Northwest Territories'
                    group by Opiates.Region_Year_Quarter""")
In [76]:
opfig4 = px.scatter(databydate, x= 'Combined_accommodation', y = 'Total_opioid_toxicity_deaths',
                    color = 'region',
                    labels = {'Total_opioid_toxicity_deaths':'<b>Total Opioid Deaths<b>', 'Combined_accommodation':'<b>Combined Accomodation CPI<b>', 'region':'Region'},
                    title = '<b>Total Opioid Deaths as a Function of Combined Accomodation CPI<b>',
                    trendline = 'ols',
                    width =1100,
                    height = 400)

opfig4.update_xaxes(ticks = 'outside')
opfig4.update_yaxes(ticks = 'outside')

opfig4.update_layout(title_x=0.5, title=dict(font=dict(size=30)))

opfig4.show()

From this graph we can clearly see Alberta, British Columbia, Ontario and Quebec. But the others are quite hard to differentiate so we will create a new graph that only contains the ones that we can not see clearly in this graph

In [77]:
databydateexcl = SQL("""select region,
                        Year_Quarter,
                        avg(Owned_accommodation) Owned_accommodation,
                        avg(Rented_accommodation) Rented_accommodation,
                        (avg(Rented_accommodation)+avg(Owned_accommodation))/2 Combined_accommodation,
                        avg(Health_care) as Health_care,
                        sum(Total_opioid_toxicity_deaths) as Total_opioid_toxicity_deaths
                        from Opiates

                        join
                        (select Region_Year_Quarter,
                        avg(Rented_accommodation) Rented_accommodation,
                        avg(Owned_accommodation) Owned_accommodation,
                        avg(Health_care) Health_care
                        from CPI_table_quarter
                        group by Region_Year_Quarter) as CPIavg
                        on Opiates.Region_Year_Quarter=CPIavg.Region_Year_Quarter

                        where region != 'Whitehorse, Yukon'
                        and region != 'Yellowknife, Northwest Territories'
                        and region != 'Ontario'
                        and region != 'British Columbia'
                        and region != 'Quebec'
                        and region != 'Alberta'
                        group by Opiates.Region_Year_Quarter""")
In [78]:
opfig5 = px.scatter(databydateexcl, x= 'Combined_accommodation', y = 'Total_opioid_toxicity_deaths',
                    color = 'region',
                    labels = {'Total_opioid_toxicity_deaths':'<b>Total Opioid Deaths<b>', 'Combined_accommodation':'<b>Combined Accomodation CPI<b>', 'region':'Region'},
                    title = '<b>Total Opioid Deaths as a Function of Combined Accomodation CPI<b>',
                    trendline = 'ols',
                    width =1100,
                    height = 400)

opfig5.update_xaxes(ticks = 'outside')
opfig5.update_yaxes(ticks = 'outside')

opfig5.update_layout(title_x=0.5, title=dict(font=dict(size=30)))

opfig5.show()

From this new graph we can clearly see the ones that are the least affected by the increase in the combined accommodation price. From this Saskatchewn and Manitoba are the two who have a significant linear regression coefficient assocaited to the combined accommodation price. We will create a graph that only includes the ones who seem to have a positive association between the combined accommodation price and the total number of deaths from opioids.

In [79]:
databydate6 = SQL("""select region,
                     Year_Quarter,
                     avg(Owned_accommodation) Owned_accommodation,
                     avg(Rented_accommodation) Rented_accommodation,
                     (avg(Rented_accommodation)+avg(Owned_accommodation))/2 Combined_accommodation,
                     avg(Health_care) as Health_care,
                     sum(Total_opioid_toxicity_deaths) as Total_opioid_toxicity_deaths
                     from Opiates

                     join
                     (select Region_Year_Quarter,
                     avg(Rented_accommodation) Rented_accommodation,
                     avg(Owned_accommodation) Owned_accommodation,
                     avg(Health_care) Health_care
                     from CPI_table_quarter
                     group by Region_Year_Quarter) as CPIavg
                     on Opiates.Region_Year_Quarter=CPIavg.Region_Year_Quarter

                     where region = 'Manitoba'
                     or region = 'Ontario'
                     or region = 'British Columbia'
                     or region = 'Quebec'
                     or region = 'Alberta'
                     or region = 'Saskatchewan'
                     group by Opiates.Region_Year_Quarter""")
In [80]:
opfig6 = px.scatter(databydate6, x= 'Combined_accommodation', y = 'Total_opioid_toxicity_deaths',
                    color = 'region',
                    labels = {'Total_opioid_toxicity_deaths':'<b>Total Opioid Deaths<b>', 'Combined_accommodation':'<b>Combined Accomodation CPI<b>', 'region':'Region'},
                    title = '<b>Total Opioid Deaths as a Function of Combined Accomodation CPI<b>',
                    trendline = 'ols',
                    width =1100,
                    height = 400)

opfig6.update_xaxes(ticks = 'outside')
opfig6.update_yaxes(ticks = 'outside')

opfig6.update_layout(title_x=0.5, title=dict(font=dict(size=30)))

opfig6.show()

In this graph we have the six main provinces whose number of opioid deaths per quarter seems to have a correlation with the combined accomodation price. We can also get all of the models for these lines to specifically look at which province has the greatest association to do some further analysis.

The models are as follows

British Columbia
Y = 12.5436X - 1156.13

Ontario
Y = 11.1X - 1101.04

Alberta
Y = 8.7025X - 1131.93

Quebec
Y = 2.81662X - 301.334

Manitoba
Y = 1.4458X - 175.923

Saskatchewan
Y = 2.15943X - 304.943

From these linear models we see that British Columbia has the highest regression coefficient and thus in theory the combined accommodation CPI has the greatest impact on the number of opioid deaths per quarter. This is very good information to know because we see that although the overall dataset did not have a very strong correlation between the two metrics it does seem that some provinces have more correlation than others. We are now actually going to look at the correlation coefficient just for British Columbia.

In [81]:
correlationdatabc = SQL("""select *
                        from Opiates

                        join
                        (select Region_Year_Quarter,
                        avg(Rented_accommodation) Rented_accommodation,
                        avg(Owned_accommodation) Owned_accommodation
                        from CPI_table_quarter
                        group by Region_Year_Quarter) as CPIavg
                        on Opiates.Region_Year_Quarter=CPIavg.Region_Year_Quarter

                        where region = 'British Columbia'""")

allcordatabc = correlationdatabc.drop(columns = ["Region","Year_Quarter","Region_Year_Quarter","Accidental_stimulant_deaths","Accidental_stimulant_poisoning_ED_visits","Accidental_stimulant_poisoning_hospitalizations","Total_stimulant_poisoning_ED_visits"], inplace=True)
allcordatabc = correlationdatabc.filter(['Total_opioid_toxicity_deaths', 'Rented_accommodation','Owned_accommodation', 'Health_care'])
In [82]:
opfig7 = px.imshow(allcordatabc.corr(), color_continuous_scale='PuBu')
opfig7.show()

From this we see that the correlation coefficients between the opioid deaths and the rented and owned accommodation price are 0.80637 and 0.85728! This is a massive increase from the overall datasets correlation coefficients and clearly shows that British Columbia has a much stronger correlation between the accommodation CPIs and the deaths caused by opioids.

We are now going to graph just British Columbias metrics.

In [83]:
databydateBC = SQL("""select region,
                      Year_Quarter,
                      avg(Owned_accommodation) Owned_accommodation,
                      avg(Rented_accommodation) Rented_accommodation,
                      (avg(Rented_accommodation)+avg(Owned_accommodation))/2 Combined_accommodation,
                      avg(Health_care) as Health_care,
                      sum(Total_opioid_toxicity_deaths) as Total_opioid_toxicity_deaths
                      from Opiates

                      join
                      (select Region_Year_Quarter,
                      avg(Rented_accommodation) Rented_accommodation,
                      avg(Owned_accommodation) Owned_accommodation,
                      avg(Health_care) Health_care
                      from CPI_table_quarter
                      group by Region_Year_Quarter) as CPIavg
                      on Opiates.Region_Year_Quarter=CPIavg.Region_Year_Quarter

                      where region = 'British Columbia'
                      group by Opiates.Region_Year_Quarter""")
In [84]:
fig = go.Figure()
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(go.Scatter(
    x=databydateBC['Year_Quarter'],
    y=databydateBC['Rented_accommodation'],
    name='Rented Accomodation Price'),
    secondary_y=False)

fig.add_trace(go.Scatter(
    x=databydateBC['Year_Quarter'],
    y=databydateBC['Owned_accommodation'],
    name='Owned Accomodation Price'),
    secondary_y=False)

fig.add_trace(go.Scatter(
    x=databydateBC['Year_Quarter'],
    y=databydateBC['Total_opioid_toxicity_deaths'],
    name='Total Opioid Toxicity Deaths'),
    secondary_y=True)

fig.add_trace(go.Scatter(
    x=databydateBC['Year_Quarter'],
    y=databydateBC['Combined_accommodation'],
    name='Combined Accommodation Price'),
    secondary_y=False)

fig.update_layout(legend=dict(yanchor="top",y=0.99, xanchor="left",x=0.01))

fig.update_xaxes(nticks=10, ticks = 'outside', title_text="<b>Year and Quarter<b>")
fig.update_layout(title_text="<b>Opioid Deaths Relationship to Accommodation CPI<b>", title_x=0.5, title=dict(font=dict(size=30)), width=1100, height=500)

fig.update_yaxes(title_text="<b>CPI<b>", ticks = 'outside',secondary_y=False)
fig.update_yaxes(title_text="<b>Total Opioid Deaths</b>", ticks = 'outside',secondary_y=True)

fig.show()

As expected due to the correlation coefficients we see that British Columbias opioid deaths and the accommodation CPIs follow a very similar pattern. They both increase through the timeframe that we are investigating. Although this does not confirm to us that there is causation between the two variables it does seem quite likely that they are being affected by one another.

Lastly, we are going to calculate all of the correlation coefficients among the provinces to see which provinces have the highest and the lowest correlation coefficients.

In [85]:
# display(abcordata.iloc[1:3, 0])
# display(bccordata.iloc[1:3, 0])
# display(manitobacordata.iloc[1:3, 0])
# display(newbcordata.iloc[1:3, 0])
# display(nfldcordata.iloc[1:3, 0])
# display(nscordata.iloc[1:3, 0])
# display(oncordata.iloc[1:3, 0])
# display(peicordata.iloc[1:3, 0])
# display(qccordata.iloc[1:3, 0])
# display(saskcordata.iloc[1:3, 0])

After examining the correlation coefficients that we have created above we actually see that the province with the highest correlation coefficient is Quebec. With British Columbia having the second highest coefficient. From this we see that although Quebec's regression coefficient was not as high as British Columbias there is still a higher correlation between the two metrics and can indiciate that there is more of a relationship between the two metrics in Quebec than in British Columbia. Although according to the models there is a stronger affect in British Columbia when the accommodation CPIs fluctuate. We see that the province with the lowest correlation coefficients is Prince Edward Island.

Are there trends in the cost of healthcare in relation to causes of death?¶

This analysis investigates whether healthcare costs from the CPI data set are related to disease-related deaths. To answer this question, the CPI and provisional death count tables will be joined on their unique monthly keys. The goal is to determine if the inability to afford treatment and other health services contributes to an increase in disease-associated mortality.

To start this analysis, the two tables were joined, and narrowed down to just Alberta. This choice was made as it carries more relevence to the class as a whole, but is also within the scope of the class.

In [86]:
general_join = pd.read_sql_query("""SELECT * FROM CPI_table_quarter
INNER JOIN death_table_month ON CPI_table_quarter.year_month_region = death_table_month.YearMonthRegion
WHERE CPI_table_quarter.province = 'Alberta';""", engine)


fig = make_subplots(rows=6, cols=2, subplot_titles=("Cost of Health Care Against Deaths From Accidents and Unintentional Injuries",
                                                    "Cost of Health Care Against Deaths From Alzheimer's Disease",
                                                    "Cost of Health Care Against Deaths From COVID-19",
                                                    "Cost of Health Care Against Deaths From Cerebrovascular Diseases",
                                                    'Cost of Health Care Against Deaths From Chronic Lower Respiratory Disease',
                                                    'Cost of Health Care Against Deaths From Diabetes Mellitus',
                                                    'Cost of Health Care Against Deaths From Diseases of The Heart',
                                                    'Cost of Health Care Against Deaths From Influenza and Pneumonia',
                                                    'Cost of Health Care Against Deaths From Intentional Self Harm',
                                                    'Cost of Health Care Against Deaths From Malignant Neoplasms',
                                                    'Cost of Health Care Against Deaths From Nephritis, Nephrotic Syndrome, and Nephrosis',
                                                   ))


fig.add_trace(
    go.Scatter(x=general_join["Health_care"], y=general_join["Accidents_(Unintentional_Injuries)"], name="Cost of Health Care Against Deaths From Accidents and Unintentional Injuries", mode = "markers"),
    row=1, col=1
)

fig.update_xaxes(title_text="Health Care Costs", row=1, col=1)
fig.update_yaxes(title_text=" Number of Deaths", row=1, col=1)

fig.add_trace(
    go.Scatter(x=general_join["Health_care"], y=general_join["Alzheimer's disease"], name= "Cost of Health Care Against Deaths From Alzheimer's Disease", mode = "markers"),
    row=1, col=2
)

fig.update_xaxes(title_text="Health Care Costs", row=1, col=2)
fig.update_yaxes(title_text=" Number of Deaths", row=1, col=2)

fig.add_trace(
    go.Scatter(x=general_join["Health_care"], y=general_join["COVID-19"], name="Cost of Health Care Against Deaths From COVID-19", mode = "markers"),
    row=2, col=1
)

fig.update_xaxes(title_text="Health Care Costs", row=2, col=1)
fig.update_yaxes(title_text=" Number of Deaths", row=2, col=1)

fig.add_trace(
    go.Scatter(x=general_join["Health_care"], y=general_join["Cerebrovascular_Diseases"], name="Cost of Health Care Against Deaths From Cerebrovascular Diseases", mode = "markers"),
    row=2, col=2
)

fig.update_xaxes(title_text="Health Care Costs", row=2, col=2)
fig.update_yaxes(title_text=" Number of Deaths", row=2, col=2)

fig.add_trace(
    go.Scatter(x=general_join["Health_care"], y=general_join["Chronic_Lower_Respiratory_Diseases"], name='Cost of Health Care Against Deaths From Chronic Lower Respiratory Disease', mode = "markers"),
    row=3, col=1
)

fig.update_xaxes(title_text="Health Care Costs", row=3, col=1)
fig.update_yaxes(title_text=" Number of Deaths", row=3, col=1)

fig.add_trace(
    go.Scatter(x=general_join["Health_care"], y=general_join["Diabetes_Mellitus"], name='Cost of Health Care Against Deaths From Diabetes Mellitus', mode = "markers"),
    row=3, col=2
)

fig.update_xaxes(title_text="Health Care Costs", row=3, col=2)
fig.update_yaxes(title_text=" Number of Deaths", row=3, col=2)

fig.add_trace(
    go.Scatter(x=general_join["Health_care"], y=general_join["Diseases_of_Heart"], name='Cost of Health Care Against Deaths From Diseases of The Heart', mode = "markers"),
    row=4, col=1
)

fig.update_xaxes(title_text="Health Care Costs", row=4, col=1)
fig.update_yaxes(title_text=" Number of Deaths", row=4, col=1)

fig.add_trace(
    go.Scatter(x=general_join["Health_care"], y=general_join["Influenza_and_Pneumonia"], name='Cost of Health Care Against Deaths From Influenza and Pneumonia', mode = "markers"),
    row=4, col=2
)

fig.update_xaxes(title_text="Health Care Costs", row=4, col=2)
fig.update_yaxes(title_text=" Number of Deaths", row=4, col=2)

fig.add_trace(
    go.Scatter(x=general_join["Health_care"], y=general_join["Suicide"], name='Cost of Health Care Against Deaths From Intentional Self Harm', mode = "markers"),
    row=5, col=1
)

fig.update_xaxes(title_text="Health Care Costs", row=5, col=1)
fig.update_yaxes(title_text=" Number of Deaths", row=5, col=1)

fig.add_trace(
    go.Scatter(x=general_join["Health_care"], y=general_join["Malignant_Neoplasms"], name= 'Cost of Health Care Against Deaths From Malignant Neoplasms', mode = "markers"),
    row=5, col=2
)

fig.update_xaxes(title_text="Health Care Costs", row=5, col=2)
fig.update_yaxes(title_text=" Number of Deaths", row=5, col=2)

fig.add_trace(
    go.Scatter(x=general_join["Health_care"], y=general_join["Nephritis,_Nephrotic_Syndrome_and_Nephrosis"], name= 'Cost of Health Care Against Deaths From Nephritis, Nephrotic Syndrome, and Nephrosis', mode = "markers"),
    row=6, col=1
)

fig.update_xaxes(title_text="Health Care Costs", row=6, col=1)
fig.update_yaxes(title_text=" Number of Deaths", row=6, col=1)

fig.update_layout(title="Cost of Health Care Against Different Causes of Death In Alberta From 1978-2024",showlegend=False, height = 1500, width = 1300)

fig.show()

What stands out among these graphs is that diseases such as Alzheimer's and and Nephritis show no discernable trends. This is likely due to steady prevelenace rates among those affected that fluctuate at a constant rate unrelated to healthcare availability. On the other hand, viruses such as COVID-19 and influenza show slight breaks in related deaths depending on seasonal patterns, vaccine rates, and restrictions (mainly COVID).

Due to the difficulties that come from separating CPI from time, it was important to assess whether the diseases with the most promising relationship to CPI were influenced more by temporal trends, rather than healthcare costs. The following graphs create joins that pinpoint specific deaths that fall into three categories. These categories are viral diseases, non-health related deaths, and diseases that are not caused by environmental factors (such as bacterium or viruses).

In [87]:
join_alberta = pd.read_sql_query("""
    SELECT CPI_table_quarter.Health_Care,
           CPI_table_quarter.province,
           death_table_month.`Alzheimer's disease`,
           CPI_table_quarter.Year
    FROM CPI_table_quarter
    INNER JOIN death_table_month
    ON CPI_table_quarter.year_month_region = death_table_month.YearMonthRegion
    WHERE CPI_table_quarter.province = 'Alberta';
""", engine)

join_alberta['Year'] = join_alberta['Year'].dt.strftime('%Y')

join_alberta['Year'] = join_alberta['Year'].astype(int)


fig = px.scatter(
    join_alberta,
    x='Health_Care',
    y="Alzheimer's disease",
    labels={
        'Health_Care': 'Price of Health Care',
        "Alzheimer's_Disease": "Deaths From Alzheimer's Disease"
    },
    title="Price of Health Care Against Deaths From Alzheimer's Disease in Alberta",
    color='Year',  # Pass the column name directly
    color_continuous_scale= 'Bluered_r'  # Continuous color scale
)


fig.show()
In [88]:
join_alberta2 = pd.read_sql_query("""
    SELECT CPI_table_quarter.Health_Care,
           CPI_table_quarter.province,
           death_table_month.`Accidents_(Unintentional_Injuries)`,
           CPI_table_quarter.Year
    FROM CPI_table_quarter
    INNER JOIN death_table_month
    ON CPI_table_quarter.year_month_region = death_table_month.YearMonthRegion
    WHERE CPI_table_quarter.province = 'Alberta';
""", engine)

join_alberta2['Year'] = join_alberta2['Year'].dt.strftime('%Y')

join_alberta2['Year'] = join_alberta2['Year'].astype(int)



fig = px.scatter(
    join_alberta2,
    x='Health_Care',
    y="Accidents_(Unintentional_Injuries)",
    labels={
        'Health_Care': 'Price of Health Care',
        "Accidents_(Unintentional_Injuries)": "Number of Deaths"
    },
    title="Price of Health Care Against Deaths From Accidental Deaths in Alberta",
    color='Year',  # Pass the column name directly
    color_continuous_scale= 'Bluered_r'  # Continuous color scale
)



fig.show()
In [89]:
join_alberta3 = pd.read_sql_query("""
    SELECT CPI_table_quarter.Health_Care,
           CPI_table_quarter.province,
           death_table_month.`Influenza_and_Pneumonia`,
           CPI_table_quarter.Year
    FROM CPI_table_quarter
    INNER JOIN death_table_month
    ON CPI_table_quarter.year_month_region = death_table_month.YearMonthRegion
    WHERE CPI_table_quarter.province = 'Alberta';
""", engine)

join_alberta3['Year'] = join_alberta3['Year'].dt.strftime('%Y')

join_alberta3['Year'] = join_alberta3['Year'].astype(int)



fig = px.scatter(
    join_alberta3,
    x='Health_Care',
    y="Influenza_and_Pneumonia",
    labels={
        'Health_Care': 'Price of Health Care',
        "Influenza_and_Pneumonia": "Number of Deaths"
    },
    title="Price of Health Care Against Deaths From Influenza and Pneumonia in Alberta",
    color='Year',  # Pass the column name directly
    color_continuous_scale= 'Bluered_r'  # Continuous color scale
)



fig.show()

What these three graphs display aligns with the suspicion that CPI is the same as assessing deaths across time. As visualized in the bar on the right-hand side of the graphs, we can see that deaths are consistent as time goes on for deaths not caused by environmental factors (Alzheimer's disease) as prevalence is generally consistent across time. This same trend follows for viral and accidental deaths, where fluctuations occur based on time, with key trends visible depending on the year. More specifically, an increase in road accidents is visible around the CPI = 145% mark, which plateaus until CPI = 155%. On the other hand, slight gaps are visible in the influenza and pneumonia-related deaths due to the seasonal prevalence of these viruses.

In conclusion, the results from the temporal analysis revealed that when CPI is joined with the deaths table, it is equivalent to assessing the number of deaths against time. This is true of virus-related deaths, accidental deaths, and diseases. To gain more valuable insights that are directly related to the price of healthcare, an additional data set would likely be necessary. For this project, we cannot say that there is a relationship between CPI and provisional deaths in Alberta, however, these are insights we would not have discovered without completing a join on the two tables. This is a largely unsurprising result, as much more than the cost of healtchare contributes to a populations health and well-being. As such, it would be incredibly difficult to discover relationships between a particular disease and health care prices that is not multifaceted in nature.

Is there correlation between unemployment rates, opioid-related deaths and deaths caused by suicide?¶

The main goal of our final project is to explore the relationship between healthcare and socieconomic factors and how they affect regions across Canada. Each member of the group focused on answering a guiding question that involves looking at more than one dataset. I will be exploring the correlation between unemployment rates, opioid-related deaths and deaths caused by suicide. This task involves joining three datasets; Provisional Weekly Death Counts, Opioid and Stimulant-Reated Harms in Canada and Monthly Labour Force Characteristics datasets.

In [90]:
labour_query = pd.read_sql_query("""SELECT quarter, AVG(Unemployment_percent) AS Unemployment_percent
                                    FROM labour_table GROUP BY quarter""",engine)
labour_query.to_sql("labour", engine, if_exists = "replace")
display(pd.read_sql_query("SELECT * FROM labour LIMIT 5", engine))
index quarter Unemployment_percent
0 0 Alberta1976Q1 4.800000
1 1 Alberta1976Q2 3.800000
2 2 Alberta1976Q3 3.566667
3 3 Alberta1976Q4 3.700000
4 4 Alberta1977Q1 5.366667

The Opioid and Stimulant-Reated Harms in Canada dataset was recorded quarterly and therefore I will be joining my dataframes with the primary key being the RegionYearQuarter. I had to create a new table from the labour force characteristics dataset. I did this by creating a query that takes the quarter column and the average unemployment rate for each quarter and grouping it by quarter.

In [91]:
opiate_death_query = pd.read_sql_query("""SELECT death_table_quarter.RegionYearQuarter, Opiates.Region,
                                        Opiates.Year_Quarter, Opiates.Total_opioid_toxicity_deaths,
                                        death_table_quarter.Suicide FROM Opiates JOIN death_table_quarter
                                        ON death_table_quarter.RegionYearQuarter =
                                        Opiates.Region_Year_Quarter""",engine)
opiate_death_query.to_sql("opiate_death", engine, if_exists = "replace")
display(pd.read_sql_query("SELECT * FROM opiate_death LIMIT 5", engine))
index RegionYearQuarter Region Year_Quarter Total_opioid_toxicity_deaths Suicide
0 0 Alberta2016Q1 Alberta 2016 Q1 137.0 165.0
1 1 Alberta2016Q2 Alberta 2016 Q2 140.0 145.0
2 2 Alberta2016Q3 Alberta 2016 Q3 159.0 165.0
3 3 Alberta2016Q4 Alberta 2016 Q4 166.0 150.0
4 4 Alberta2017Q1 Alberta 2017 Q1 162.0 155.0

I decided to start by joining the opioids dataset and the death count dataset. I did this by selecting the region, year_quarter, total_opioid_toxicity_deaths and number of suicide deaths and joining the datasets using the primary key RegionYearQuarter. I initally did not select the RegionYearQuarter column, but realized that I needed it to do the second join.

In [92]:
join_query = pd.read_sql_query("""SELECT opiate_death.Region, opiate_death.Year_Quarter,
                                opiate_death.Total_opioid_toxicity_deaths,
                                opiate_death.Suicide, labour.Unemployment_percent FROM opiate_death
                                JOIN labour ON opiate_death.RegionYearQuarter = labour.quarter""",engine)
join_query.to_sql("final_table", engine, if_exists = "replace")
display(pd.read_sql_query("SELECT * FROM final_table LIMIT 5", engine))
index Region Year_Quarter Total_opioid_toxicity_deaths Suicide Unemployment_percent
0 0 Alberta 2016 Q1 137.0 165.0 8.000000
1 1 Alberta 2016 Q2 140.0 145.0 8.066667
2 2 Alberta 2016 Q3 159.0 165.0 8.666667
3 3 Alberta 2016 Q4 166.0 150.0 8.066667
4 4 Alberta 2017 Q1 162.0 155.0 8.866667

This query joins the labour force dataset with the newly joined opioid and death count dataset. This was done by selecting the columns needed for analysis and joining them using the same primary key RegionYearQuarter.

In [93]:
fig = px.scatter(join_query,
                    x='Unemployment_percent',
                    y='Suicide',
                    title='Unemployment Rate vs # of Suicide Related Deaths per Quarter in Canada from 2016-2024',
                    labels={'Unemployment_percent': 'Unemployment Rate (%)', 'Suicide': '# of Suicide Related Deaths'},
                    width = 1000,
                    height = 700)
fig.show()

The scatter plot above was created using the joined dataset. Is is looking at the correlation between unemployment rate and number of suicide related deaths per quarter in Canada from 2010-2024. My initial hypothesis was that there would be a positive correlation between these two variables. I thought that as unemployment rates go up and people lose their jobs, life gets harder and people become more depressed, in turn the suicide rates would go up. After analysing this scatter plot, we can conclude that we should reject my initial hypothesis and state that there seems to be no correlation between unemplyment rates and number of suicide related deaths in Canada from 2010-2024.

In [94]:
fig = px.scatter(join_query,
                    x='Unemployment_percent',
                    y='Total_opioid_toxicity_deaths',
                    title='Unemployment Rate vs # of Opioid Related Deaths per Quarter in Canada from 2016-2024',
                    labels={'Unemployment_percent': 'Unemployment Rate (%)', 'Total_opioid_toxicity_deaths': '# of Opioid Related Deaths'},
                    width = 1000,
                    height = 700)
fig.show()

This scatter plot was also created using the joined dataset. Is is looking at the correlation between unemployment rate and number of opioid related deaths per quarter in Canada from 2010-2024. Again, my initial hypothesis was that there would be a positive correlation between these two variables for the same reasons as the previous variables. This scatter plot shows no correlation between unemplyment rates and number of suicide related deaths in Canada from 2010-2024 and therefore I can conclude that my hypothesis holds no truth.

In [95]:
# unpivot the joined table
unpivotted_deaths = pd.melt(join_query, id_vars=['Year_Quarter'], value_vars=['Total_opioid_toxicity_deaths', 'Suicide'])
# upload table into SQL
unpivotted_deaths.to_sql("unpivotted_deaths", engine, if_exists = "replace")
Out[95]:
660
In [96]:
# create query to sum number of deaths (Canada) for each cause of death (suicide and opioid deaths)
canada_query = pd.read_sql_query("""SELECT Year_Quarter, variable, SUM(value)
            FROM unpivotted_deaths GROUP BY Year_Quarter, variable""", engine)

To create the the next visualiztion, I had to unpivot the joined table and uplaod the new table into SQL. Then I created a query to sum all of the deaths for each cause and for each quarter without taking region into account. This is because I wanted the total number of deaths for Canada per quarter.

In [97]:
fig = px.line(canada_query,
                    x='Year_Quarter',
                    y='SUM(value)',
                    color='variable',
                    title='Number of Deaths per Quarter in Canada from 2016-2024',
                    labels={'Year_Quarter': 'Yearly Quarter', 'SUM(value)': '# of Deaths','variable':'Cause of Death'},
                    width = 1000,
                    height = 700)
fig.show()

This graph shows the total number of deaths in Canada for each cause of death over the years 2016-2024. The most noticeable feature of the graph is the steep incline in opioid realated deaths in the year 2020. This spike is not suprising as it happens at the same time COVID-19 hit Canada and went into lockdown. There are probably other factors that are worth being explored as to why the opioid related deaths have not seen much of a decrease since the end of the pandemic.

Is there fluctuation in the price of goods as unemployment rates change, and does this differs between recreational goods such as alcohol versus necessities such as shelter?¶

To investigate this question, we decided to combine the CPI and Labour force characteristics datasets. The steps to combine these datesets are fairly simply. Since both datasets are split by region and month, we can join using these columns. Additionally, we already did most of the hard work in the cleaning steps, where we created year month region keys in the datasets. Thus, we can simply inner join the two datasets on these columns. The reason we choose to inner join is so that we only retain the dates and provinces that are in both datasets. For example, the labour dataset does not contain territories, and the CPI dataset does not have exactly the same years, so inner join ensures our combined data contains rows where there are data for both.

We join the data using a query and pull it into a pandas dataframe in the cell below.

In [98]:
# join CPI and labour on month and date
cpilab_fulljoin = pdsql("""
SELECT * FROM
    CPI_table_quarter t1 INNER JOIN labour_table t2 ON t1.year_month_region = t2.dateregion
""")
In [99]:
cpilab_fulljoin.head()
Out[99]:
year_month_region Food_purchased_from_stores Food__purchased_from_restaurants Rented_accommodation Owned_accommodation Water_fuel_and_electricity Household_operations Household_furnishings_and_equipment Clothing Footwear ... Population_thousands LabourForce_thousands Employment_thousands Fulltime_thousands Parttime_thousands Unemployment_thousands Unemployment_percent Participation_percent Employment_percent quarter
0 1978-09:Alberta 42.2 36.2 47.7 43.0 23.8 41.9 52.3 NaN 55.9 ... 1484.5 1017.9 971.7 834.2 137.5 46.3 4.5 68.6 65.5 Alberta1978Q3
1 1978-09:British Columbia 41.0 35.0 45.9 61.9 33.1 39.8 49.5 NaN 47.5 ... 1983.3 1251.0 1162.1 989.6 172.5 88.9 7.1 63.1 58.6 British Columbia1978Q3
2 1978-09:Manitoba 43.2 34.2 45.5 41.2 31.3 39.3 45.3 NaN 42.4 ... 755.3 478.4 447.6 382.7 64.8 30.8 6.4 63.3 59.3 Manitoba1978Q3
3 1978-09:New Brunswick 42.9 33.8 48.6 43.6 28.7 41.4 54.2 NaN 51.5 ... 500.1 282.6 253.2 224.2 29.0 29.3 10.4 56.5 50.6 New Brunswick1978Q3
4 1978-09:Newfoundland and Labrador NaN 36.2 53.7 43.5 31.1 46.7 55.7 NaN 53.8 ... 383.2 208.6 180.0 164.8 15.2 28.6 13.7 54.4 47.0 Newfoundland and Labrador1978Q3

5 rows × 41 columns

We can also combine the goods and services columns to get a total CPI, which we can use to quasi adjust for inflation.

In [100]:
cpilab_fulljoin["TotalCPI"] = cpilab_fulljoin["Goods"] + cpilab_fulljoin["Services"]

For example, by taking the CPI for recreational goods and activities and taking it as a percentage of the total CPI, we can see how the relative price of recreation has changed over the years.

In [101]:
cpilab_fulljoin["recreation_percent"] = 100 * cpilab_fulljoin["Recreation"] / cpilab_fulljoin["TotalCPI"]
In [102]:
# PLOT: recreation cost against participation, for alberta
multi_line_plot(
    cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta'],
    columntuples = [('date', 'Participation_percent'), ('date','recreation_percent')],
    serieslabels = ['Participation Rate (%)', 'Recreation CPI / Total CPI (%)'],
    xlabel = 'Date',
    ylabel = "Percentage",
    title = "Recreation and Participation Rate over time in Alberta"
)

Here we see that the participation rate in alberta appears to vary seasonally due to the small spikes every summer. However in general, it appears to mainly stay steady at around 70%, except for in 2020 with the effect of the covid 19 pandemic, where there is a sharp dropoff where people are choosing not to work (Participation rate is different from employment, so this is just the dropoff of people who are capable of working but are choosing not to look for work). In contrast, we see that the CPI for recreation (including things like toys and games, but also events like concerts, sports games, vacations, etc) has gotten relatively cheaper over time. This was calculated by taking the ratio of recreational CPI to the combined CPI of goods and services as a sort of quasi method of accounting for inflation. While one might expect that if recreational goods are especially cheap there may be more desire to take time off work, etc. (or if you are out of work, to not look for a while and just relax), but this does not seem to be the case here.

Next we can look at another 'optional' or luxury good, eating out at restaurants. Again we look at Alberta specifically. Here, we plot it against unemployment. We again take the CPI as a percentage of total CPI to account for increasing costs in general.

In [103]:
# PLOT: restaurant cost against unemployment, for alberta
custom_scatter(
    xdata = cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']['Unemployment_percent'],
    ydata = 100* cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']["Food__purchased_from_restaurants"]
                / cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']["TotalCPI"],
    colordata = [int(x.strftime('%Y')) for x in cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']['date']],
    cbartitle = 'Year',
    titlelabel = 'Alberta Restaurant Prices vs Unemployment',
    xlabel = 'Unemployment (%)',
    ylabel = 'Food From Restaurants CPI / Total CPI (%)',
    xrange = [0,20]
)

In the above plot, we can see there does not seem to be much if any relationship between the two. We can see a grouping of points above 52 percent on the y-axis. These points all appear to be after ~2015. From this, it appears that the biggest relationship with restaurant prices seems to be with year, with costs exploding after 2015. This is unlikely to have anything to do with unemployment, as we can see that there is a spread of unemployment levels in the top grouping. Since restaurant margins are generally quite thin, this likely better reflects an increase in input costs for restaurants such as rent or food supply cost increases.

We can also do similar analysis on neccesities, to see if there are any correlations with unemployment in that case.

In [104]:
# PLOT -> grocery prices vs unemployment
custom_scatter(
    xdata = cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']['Unemployment_percent'],
    ydata = 100* cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']['Food_purchased_from_stores']
               / cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']['TotalCPI'],
    colordata = [int(x.strftime('%Y')) for x in cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']['date']],
    cbartitle = 'Year',
    titlelabel = 'Alberta Grocery Prices vs Unemployment',
    xlabel = 'Unemployment (%)',
    ylabel = 'Food From Stores CPI / Total CPI (%)',
    xrange = [0,20]
)

With the exception of the late 70's and early 80's grouping in the top left, it appears there is a general upwards trend in the data. This could be explained moreso by periods of higher unemployment and higher grocery prices both coming from a third source, such as periods of recession, etc. Rathar than people finding more work when their neccesities become more expensive, which does not appear to be the case.

We can also look at things such as rental accommodations:

In [105]:
# PLOT -> rental prices vs unemployment
custom_scatter(
    xdata = cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']['Unemployment_percent'],
    ydata = 100* cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']['Rented_accommodation']
               / cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']['TotalCPI'],
    colordata = [int(x.strftime('%Y')) for x in cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']['date']],
    cbartitle = 'Year',
    titlelabel = 'Alberta Rental Prices vs Unemployment',
    xlabel = 'Unemployment (%)',
    ylabel = 'Rented Accommodation CPI / Total CPI (%)',
    xrange = [0,20]
)

We can see that in a given date range, the values seem to stay at a consistant CPI, and the spread is mostly over unemployment. That is, even when unemployment is shifting, it appears rental accomodation stays fairly consistent, with no apparantly affect from unemployment. One reason this could be the case is that regardless of work status, people still have to pay their rent, so there seems to not be much effect.

Next we can look at a correlation matrix to see if there is anything else interesting to investigate.

In [106]:
# correlation matrix for Alberta (rows are CPI, columns are Labour)
# select only numeric columns (no region, date, etc)
cpilabnumeric = cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta'].select_dtypes(include='float64')
cpilabcorrelation = cpilabnumeric.corr() # calculate correlations
cpilabcorrelation.iloc[:23,23:-3].style.background_gradient(cmap='coolwarm') # select which rows/cols and color
Out[106]:
  Population_thousands LabourForce_thousands Employment_thousands Fulltime_thousands Parttime_thousands Unemployment_thousands Unemployment_percent Participation_percent
Food_purchased_from_stores 0.983042 0.973369 0.962707 0.949621 0.944780 0.592327 -0.103181 -0.106262
Food__purchased_from_restaurants 0.989953 0.980292 0.970003 0.955271 0.958442 0.592234 -0.118700 -0.113703
Rented_accommodation 0.986066 0.989075 0.983938 0.972313 0.958249 0.547147 -0.165262 0.004779
Owned_accommodation 0.990646 0.985920 0.975818 0.967491 0.936873 0.593251 -0.112225 -0.076395
Water_fuel_and_electricity 0.967121 0.962610 0.962654 0.955216 0.920964 0.484053 -0.210397 -0.063072
Household_operations 0.972622 0.977735 0.969139 0.954626 0.956727 0.574678 -0.120378 0.029188
Household_furnishings_and_equipment 0.733563 0.731778 0.722064 0.692260 0.792731 0.461649 -0.020770 0.042581
Clothing -0.230552 -0.216890 -0.178751 -0.194896 -0.092290 -0.465478 -0.402347 0.175307
Footwear 0.457617 0.469421 0.461780 0.429022 0.564600 0.309740 0.036137 0.184096
Clothing_accessories_watches_jewellery 0.972415 0.965579 0.954175 0.942341 0.920771 0.538609 -0.187909 -0.282393
Clothing_material_notions_services 0.993039 0.989750 0.979894 0.966946 0.960076 0.592835 -0.122280 -0.059012
Public_transportation 0.981546 0.974417 0.968042 0.956739 0.942198 0.551678 -0.162592 -0.088523
Gasoline 0.938011 0.942250 0.946236 0.945393 0.878044 0.435917 -0.225858 0.022472
Private_transportation 0.978003 0.974161 0.971676 0.959544 0.949053 0.514165 -0.190427 -0.050520
Health_care 0.971659 0.975083 0.967201 0.951916 0.958182 0.566486 -0.130042 0.012459
Personal_care 0.965894 0.960494 0.950837 0.932703 0.955045 0.576203 -0.103747 -0.056431
Recreation 0.832613 0.845995 0.845734 0.825198 0.868012 0.428319 -0.171422 0.151770
Education_and_reading 0.983447 0.981271 0.981373 0.968590 0.949805 0.438444 -0.334741 -0.236719
Alcoholic_beverages 0.972135 0.974639 0.968068 0.952125 0.961752 0.553676 -0.149584 0.006219
Tobacco_smokers_supplies 0.980379 0.966909 0.955765 0.943663 0.934223 0.593726 -0.114581 -0.153437
Recreational_cannabis -0.687308 -0.482901 -0.421765 -0.370882 -0.406958 0.128972 0.177343 0.473292
Goods 0.948628 0.949313 0.946622 0.930237 0.943793 0.503650 -0.171152 0.009129
Services 0.997018 0.994540 0.986713 0.975869 0.957528 0.575741 -0.144557 -0.052488

From the above, we can see that most of the labour force characteristics in units of thousands correlate with most of the CPI data. This is likely due to both population and CPI increasing over time. There are a few exceptions. Such as Cannabis, which has a different reference point and was only legally introduced recently. As well as clothing and footwear, which seem to have gotten cheaper recently, likely due to outsourcing production and the fast fashion business, which we can check below. We also note that there does not seem to be any decent correlation with any sort of rates and any of the other CPI data.

For clothing, we can plot a scatterplot as before:

In [107]:
# PLOT -> clothing prices vs unemployment
custom_scatter(
    xdata = cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']['Unemployment_percent'],
    ydata = cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']['Clothing']
               / cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']['TotalCPI'],
    colordata = [int(x.strftime('%Y')) for x in cpilab_fulljoin[cpilab_fulljoin.region == 'Alberta']['date']],
    cbartitle = 'Year',
    titlelabel = 'Alberta Clothing Prices vs Unemployment',
    xlabel = 'Unemployment (%)',
    ylabel = 'Clothing CPI / Total CPI (%)',
    xrange = [0,20]
)

Here we see a very interesting crescent pattern to the data. There seem to be two prongs, the upper containing data before ~2000, and the lower containing data after ~2000. Again we see that the CPI seems more affected by time than anything else. Mainly, we can see the affect of the fast fashion business model, where businesses sell bulk, cheap clothing. This can be seen in the staggering difference of almost 40% between the early data and the late data. However, there still does not seem to be any sort of relationship with unemployment rate.

Guiding Question Conclusion¶

We set out to answer the question of whether fluctuations in unemployment rates and price of goods had any sort of relationship, and whether this differed between recreational goods and neccesities. By joining the datasets on matching province and month, we can get a snapshot of the unemployment rates and relative price of goods in that region and time. We analyzed the distribution of these points and found that regardless of the category of the product, there did not seem to be any relationship between CPI and unemployment rates. This is likely because this is a complex system, and there are many factors that effect both the price of goods and services and unemployment rates. For the most part, it seems that both of these factors are more related to the year than anything else, which is likely due to external factors such as the covid 19 pandemic or other world events that occur in specific years, causing a knock on effect to things like the unemployment or the price of goods and services.

Discussion¶

Talk about what each individual learned from this project. Are there technologies or techniques you feel would be a better fit now, a particular set of tasks you would do differently now, or ways that you would extend this project in the future?

Maxwell
I think that the biggest thing I learned in this project is the ability to think about datasets in a different manner. In this project picking datasets was one of the hardest parts and on our first attempt we did not correctly pick datasets that could be easily joined together due to the non unique nature of each entry. This slight hiccup that we encountered forced us to look at the datasets we wanted to pick much closer and make sure that there were ways that we could create primary keys for the dataset that match other datasets to join too. This helped me learn the concept of pivoting data and how to do it both through SQL and pandas in order to turn our long format data into wide format data where it was much easier to create a unique key that would match to the other datasets.

This problem with the joining of datasets culminated in us figuring out a way to join datasets that were across different time periods. This involved adding new columns to the datasets with different time frames associated to them and grouping based on those columns when needed. For example when I was working with the opioid dataset and the CPI dataset, I had to group the CPI dataset based on quarter, region and year before I could actually join it to the opioid dataset because it was in quartesr as well. I thought that I had a decent grasp on joining datasets together before this class but it has definitely helped solidify that understanding by a large amount.

In the future I think it would be very interesting to extend this project to include more datasets and have more analysis between the data. I think that the wonderful part about this project is that our datasets are only being connected to eachother by the region and date. This makes it quite easy to be able to connect many other datasets as long as they are based in Canada and have time associated to them. And now that I understand that we can group datasets before joining them, we would not even need the time frames (days, months, quarters etc) to be consistent.

Kennedy

The most valuable insight I gained from this project was recognizing the multitude of ways to approach and solve data-oriented questions. For instance, the CPI and provisional deaths join may also be completed using something like Pandas. However, the ability to parse through large data sets cannot be understated as a valuable function of working with SQL. Additionally, the ability to accomplish a single task using multiple methods, or combining multiple methods is an invaluable skill. In real-world scenarios, there may be requirements to conduct analyses using specific tools or coding languages, so being able to switch between them is incredibly valuable.

As with the projects in earlier classes, I once again faced challenges with how to display data most intuitively. The ability to use both Pandas and SQL made this data manipulation more straightforward, as it allowed for querying only the information that was necessary for a given question.

As of right now, I don't think that there was a particular technology that would have been a better fit for this project. With my current skill set and knowledge base, I found that SQL was incredibly straightforward to use, which made understanding our large data sets more clear.

There are several ways I would extend this project in the future. Based on what I learned from joining the CPI and the provisional death datasets, I would instead choose a dataset that has not been adjusted to be relative to 2002. This adjustment made concluding the actual price of goods for different types of deaths unsuccessful. I would instead opt for a dataset where the prices reflect the actual prices of goods and services over time (and that has been adjusted for inflation), which would reveal clearer trends and relationships with other variables. Additionally, since the questions we chose to analyze depend on so many other factors than what was explored, it may be interesting to add additional datasets to our analysis to get a more thorough and full understanding of how health and socioeconomic factors interact.

Noah

During this project, I learned and put into practice a number of practical skills, such as handling very large datasets by breaking them into chunks, joining data from multiple tables and pivoting data in SQL. These are all important things that will likely come up again, and working on them during a project where there is a specific goal in mind helped me to solidify my understanding.

I think a technique that we could use next time would be to use multiple columns in conjunction as the primary key, instead of manually creating a single new column with the same information. When we were originally figuring out how to join the datasets together, I did not realize this was an option and thought that it had to be a single column. This would avoid the problem of making sure all of the datasets had the new column created in the same way and would save space in some of the larger datasets. Another technique would be to specify the table creation manually using SQL commands instead of using pandas, which would allow us to be more specific with things like the amount of space allocated to each column, or to specify which columns to be the primary key or to create an index.

I think this project could be extended by finding datasets that are more relevant to each other, or more specific. With large datasets and complex topics, it can be hard to find relationships between two factors like we were trying to do here.

Bobbi

This project allowed me to develop skills and techniques learned in class. Starting out in week one we picked our datasets without any previous knowledge of how to join them together. Fast forward to today, and we have completed an in-depth analysis of four datasets individually and joined together.

Our first challenge was to be able to find four datasets that had a unique identifier. Our initial attempt at choosing four datasets with a unique key did not work out and we ended up having to find a few new datasets after our project pitch. Our idea of merging the region and date to create a unique key worked out extremely well.

Throughout the creation of the project, I followed along with my group members' work. One of the things that interested me the most was seeing how we all chose to clean our data differently. Some group members used panadas while others used SQL. I would like to try cleaning in SQL now that I am more familiar with it, but I still believe cleaning in panadas was the best choice for this project as I was able to keep my code minimal and clean. I enjoyed working with this group because they taught me a lot and really pushed me to fully explore the data.

Lastly, I would like to extend the project by bringing in a dataset pertaining to COVID-19. The line graph, Number of Deaths per Quarter in Canada from 2016-2024, showed a sharp increase in opioid-related deaths. I would like to see if my assumption is correct that this increase is due to the COVID-19 Pandemic and all of the lockdowns we had to endure. Our project covers a large range of topics and therefore it can go in many different ways bringing in many different datasets with respect to Canadian healthcare and socioeconomic factors.

Conclusion¶

Summarize the project, especially the parts you feel were most noteworthy.

In conclusion, we see that overall there is not always a strong correlation between the number of opioid deaths and the accommodation CPIs. However, there are large differences in the correlation among provinces. The provinces with the smallest and basically negligible regression coefficients are PEI, Nova Scotia, Newfoundland and Labrador and New Brunswick. These provinces also have some of the lowest correlation coefficients when compared to the other provinces. The province with the highest regression coefficient for the model is British Columbia. However, the province with the highest correlation coefficients is British Columbia. We can conclude that there does seem to be a correlation between the number of opioid deaths and the accommodation CPIs depending on which province you are examining.

When looking at the relationship between CPI and provisional weekly death counts, we found that there were no definitive conclusions that could be made. More specifically, the cost of goods in the CPI data set more accurately represents a time scale than the costs of goods and services. To more accurately answer the guiding question: are there trends in the price of healthcare about causes of death, a new data set could be employed. Additionally, future work could include Canada-wide analyses to compare deaths for healthcare prices across provinces. To complete such a task, death counts must be converted to rates to account for population differences across regions. However, the insights that were gained for the joins between these two data sets was incredibley valuable for gaining an understanding of the nuances of trying to make insights between two originally unrelated data sets.

In terms of the relationship between Labour Force Characteristics and Consumer Price Index, we found that there was no relationship between any of the variables we investigated. This includes the relationships between unemployment rates and various goods and services, as well as participation rates and other labour characteristics. We found that there was some correlation between CPI and labour characteristics in raw numbers (eg thousands of people), but this is more due to population and inflation both increasing over time than any actual relationship. The most likely reason we cannot find any relationships is because unemployment and consumer price index are such complex topics which vary in response to a myriad of different factors, and any single variable would only explain a small part of the variation.

When exploring the correlation between unemployment rates, opioid-related deaths, and deaths caused by suicide, we found that there is no correlation between these variables. To take a deeper look into the data we decided to look at the number of deaths for each cause of death in Canada to see if there are any trends. We observed a sharp increase in opioid-related deaths in 2020, which became the most prevalent cause of death between deaths caused by suicide and deaths caused by opioids. These variables could also be dependent on each other and therefore, there may be an increase in opioid-related deaths but this could also be considered a cause of suicide. It would be interesting to note how the data is collected in order to maintain data integrity.

References¶

Alex Nae and Mark Barnes. “Economic Policy Uncertainty and US Equity Volatility.” LSEG, 22 Oct. 2024, https://www.lseg.com/en/insights/ftse-russell/economic-policy-uncertainty-and-us-equity-volatility.

Alleckna, M. (2024, February 2). Why Alberta Electricity Bills are Getting Higher — and What you can do About it. EnergyRates.ca. https://energyrates.ca/why-alberta-electricity-bills-are-getting-higher-and-what-you-can-do-about-it/

Canada, Health. Modelling Opioid-Related Deaths during the Overdose Crisis. 26 Nov. 2020, https://www.canada.ca/en/health-canada/services/opioids/data-surveillance-research/modelling.html.

Cheng, Tessa, et al. “Increases and Decreases in Drug Use Attributed to Housing Status among Street-Involved Youth in a Canadian Setting.” Harm Reduction Journal, vol. 11, Apr. 2014, p. 12. PubMed Central, https://doi.org/10.1186/1477-7517-11-12.

Classen, Timothy J., and Richard A. Dunn. “The Effect of Job Loss and Unemployment Duration on Suicide Risk in the United States: A New Look Using Mass-Layoffs and Unemployment Duration.” Health Economics, vol. 21, no. 3, Mar. 2012, pp. 338–50. PubMed Central, https://doi.org/10.1002/hec.1719.

Giesbrecht, N., Wettlaufer, A., Thomas, G., Stockwell, T., Thompson, K., April, N., Asbridge, M., Cukier, S., Mann, R., McAllister, J., Murie, A., Pauley, C., Plamondon, L., & Vallance, K. (2015). Pricing of alcohol in Canada: A comparison of provincial policies and harm‐reduction opportunities. Drug and Alcohol Review, 35(3), 289–297. https://doi.org/10.1111/dar.12338

Secretariat, T. B. of C., & Open Government, T. B. S. of C. (n.d.). Open government licence—Canada. Retrieved December 4, 2024, from http://open.canada.ca/en/open-government-licence-canada

Government of Canada, Statistics Canada. (2024, October 15). Consumer Price Index, monthly, not seasonally adjusted. https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1810000401

Government of Canada, Statistics Canada. (2024, October 11). Labour force characteristics, monthly, seasonally adjusted and trend-cycle, last 5 months. https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701

Government of Canada, Statistics Canada. (2024, October 10). Provisional weekly death counts, by selected grouped causes of death. https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1310081001

Provinces and territories—Canada. (n.d.). Retrieved December 3, 2024, from https://data.opendatasoft.com/explore/dataset/georef-canada-province@public/information/?disjunctive.prov_name_en

How to pivot rows into columns MySQL. (n.d.). Database Administrators Stack Exchange. https://dba.stackexchange.com/questions/164711/how-to-pivot-rows-into-columns-mysql

Public Health Agency of Canada. (2024, September 13). Opioid- and Stimulant-related Harms in Canada: Key findings — Canada.ca. https://health-infobase.canada.ca/substance-related-harms/opioids-stimulants/

Vandoros, Sotiris, and Ichiro Kawachi. “Economic Uncertainty and Suicide in the United States.” European Journal of Epidemiology, vol. 36, no. 6, 2021, pp. 641–47. PubMed Central, https://doi.org/10.1007/s10654-021-00770-4.

“Plotly.” Plotly Open Source Graphing Libraries, https://plotly.com/api/. Accessed 4 Dec. 2024.

pandas.DataFrame.to_sql — pandas 2.2.3 documentation. (n.d.). https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html

W3Schools.com. (n.d.). https://www.w3schools.com/mysql/mysql_limit.asp

Renaming column names in Pandas. (n.d.). Stack Overflow. https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas

Trying to remove every character inside brackets “[. . .].” (n.d.). Stack Overflow. https://stackoverflow.com/questions/67059532/trying-to-remove-every-character-inside-brackets

Stedman, C. (2024, June 4). What is data preparation? an in-depth guide: TechTarget. Search Business Analytics. https://www.techtarget.com/searchbusinessanalytics/definition/data-preparation#:~:text=One%20of%20the%20main%20purposes,reconciled%20when%20they’re%20combined.

In [108]:
# clear up resources
engine.dispose()